def generateGraphData(self):
    safePrint('Generating and uploading data files')
    allData = read_table(self.combinedFile, sep='\t', na_filter=False,
                         parse_dates=[0], infer_datetime_format=True)
    xcsList = [xcs for xcs in allData.xcs.unique()
               if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00']
    # filter type==DATA and site==wikipedia
    allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]

    # By "iszero+via", e.g. a,b,aO,bO,..., where 'a' == zero-rated,
    # 'b' == non-zero-rated, and 'O' == Opera
    data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'],
                                 aggfunc=np.sum))
    data.reset_index(inplace=True)
    data['via'] = data.apply(
        lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1)
    data.drop('iszero', axis=1, inplace=True)
    self.createClippedData('RawData:YearDailyViaIsZero', data)
    self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly)
    self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly)

    # Zero-rated traffic broken down by allowed subdomain
    allowedSubdomains = ['m', 'zero']
    data = allData[(allData.ison == 'y') & (allData.iszero == 'y') &
                   (allData.subdomain.isin(allowedSubdomains))]
    data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'],
                                 aggfunc=np.sum))
    data.reset_index(inplace=True)
    self.createClippedData('RawData:YearDailySubdomains', data)
    self.createPeriodData('RawData:WeeklySubdomains', data, weekly)
    self.createPeriodData('RawData:MonthlySubdomains', data, monthly)

    # Create artificial yes/no/opera sums
    opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')]
    opera['str'] = 'o'
    yes = allData[allData.iszero == 'y']
    yes['str'] = 'y'
    no = allData[allData.iszero == 'n']
    no['str'] = 'n'
    combined = opera.append(yes).append(no)
    data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'],
                                 aggfunc=np.sum))
    data.reset_index(inplace=True)
    headerFields = 'date,xcs,iszero,count'  # Override "str" as "iszero"
    self.createClippedData('RawData:YearDailyTotals', data, headerFields)
    self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields)

    # Per-xcs language breakdown: top 5 languages plus an "other" bucket, as percentages
    data = []
    for xcsId in list(allData.xcs.unique()):
        byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'],
                             aggfunc=np.sum).order('count', ascending=False)
        top = byLang.head(5)
        vals = list(top.iteritems())
        vals.append(('other', byLang.sum() - top.sum()))
        valsTotal = sum([v[1] for v in vals]) / 100.0
        data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals])
    self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
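# Illustrative sketch (not part of the original module): how the "iszero+via"
# re-encoding above collapses the two columns into a single label. The tiny
# frame and its column values below are made up for demonstration only.
import pandas as pd

_demo = pd.DataFrame({
    'iszero': ['y', 'n', 'y'],
    'via': ['DIRECT', 'DIRECT', 'OPERA'],
})
# 'a' marks zero-rated rows, 'b' non-zero-rated; the first letter of `via`
# is appended to distinguish the carrier proxy (e.g. 'O' for Opera).
_demo['via'] = _demo.apply(
    lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1)
print(_demo['via'].tolist())  # ['aD', 'bD', 'aO']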
def test_record_prefix(self, state_data):
    result = json_normalize(state_data[0], 'counties')
    expected = DataFrame(state_data[0]['counties'])
    tm.assert_frame_equal(result, expected)

    result = json_normalize(state_data, 'counties', meta='state',
                            record_prefix='county_')
    expected = []
    for rec in state_data:
        expected.extend(rec['counties'])
    expected = DataFrame(expected)
    expected = expected.rename(columns=lambda x: 'county_' + x)
    expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
    tm.assert_frame_equal(result, expected)
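# Assumed shape of the `state_data` fixture shared by the surrounding
# json_normalize tests (a sketch, not copied from the original suite): two
# records, "Florida" with three counties and "Ohio" with two, which is what
# np.array(['Florida', 'Ohio']).repeat([3, 2]) relies on. County names and
# populations here are illustrative.
import pytest

@pytest.fixture
def state_data():
    return [
        {
            'state': 'Florida',
            'shortname': 'FL',
            'counties': [
                {'name': 'Dade', 'population': 12345},
                {'name': 'Broward', 'population': 40000},
                {'name': 'Palm Beach', 'population': 60000},
            ],
        },
        {
            'state': 'Ohio',
            'shortname': 'OH',
            'counties': [
                {'name': 'Summit', 'population': 1234},
                {'name': 'Cuyahoga', 'population': 1337},
            ],
        },
    ]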
def test_record_prefix(self):
    result = json_normalize(self.state_data[0], 'counties')
    expected = DataFrame(self.state_data[0]['counties'])
    tm.assert_frame_equal(result, expected)

    result = json_normalize(self.state_data, 'counties', meta='state',
                            record_prefix='county_')
    expected = []
    for rec in self.state_data:
        expected.extend(rec['counties'])
    expected = DataFrame(expected)
    expected = expected.rename(columns=lambda x: 'county_' + x)
    expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
    tm.assert_frame_equal(result, expected)
def test_simple_normalize(self, state_data):
    result = json_normalize(state_data[0], 'counties')
    expected = DataFrame(state_data[0]['counties'])
    tm.assert_frame_equal(result, expected)

    result = json_normalize(state_data, 'counties')
    expected = []
    for rec in state_data:
        expected.extend(rec['counties'])
    expected = DataFrame(expected)
    tm.assert_frame_equal(result, expected)

    result = json_normalize(state_data, 'counties', meta='state')
    expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
    tm.assert_frame_equal(result, expected)
def test_simple_normalize(self):
    result = json_normalize(self.state_data[0], 'counties')
    expected = DataFrame(self.state_data[0]['counties'])
    tm.assert_frame_equal(result, expected)

    result = json_normalize(self.state_data, 'counties')
    expected = []
    for rec in self.state_data:
        expected.extend(rec['counties'])
    expected = DataFrame(expected)
    tm.assert_frame_equal(result, expected)

    result = json_normalize(self.state_data, 'counties', meta='state')
    expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
    tm.assert_frame_equal(result, expected)