Example #1
0
    def generateGraphData(self):
        """Aggregate the combined data file and hand each data set to the output helpers.

        Reads ``self.combinedFile`` as a tab-separated table (first column parsed
        as dates) and uses the columns ``date``, ``xcs``, ``site``, ``via``,
        ``iszero``, ``ison``, ``subdomain``, ``lang`` and ``count``.

        Four families of output are produced from the filtered rows:

        * counts per date/xcs/"iszero+via" code  -> ``RawData:*ViaIsZero``
        * counts per date/xcs/subdomain          -> ``RawData:*Subdomains``
        * counts per date/xcs/(o|y|n) marker     -> ``RawData:*Totals``
        * top-5 language percentages per xcs     -> ``RawData:LangPercent``

        Writing is delegated to ``createClippedData``, ``createPeriodData`` and
        ``saveWikiPage``; this method returns nothing.
        """
        safePrint('Generating and uploading data files')

        allData = read_table(self.combinedFile, sep='\t', na_filter=False, parse_dates=[0], infer_datetime_format=True)
        # Real carrier ids only: drop the 'ERROR' marker, 'TEST*' ids and the '000-00' placeholder.
        xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00']

        # Keep rows whose xcs is in xcsList and whose site is wikipedia.
        allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]

        # By "iszero+via", e.g.  a,b,aO,bO,..., where 'a' == zero-rated, 'b' == non-zero-rated, and 'O' == Opera
        data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'], aggfunc=np.sum))
        data.reset_index(inplace=True)
        # Collapse (iszero, via) into one short code: 'a'/'b' followed by the first character of via.
        data['via'] = data.apply(lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1)
        data.drop('iszero', axis=1, inplace=True)
        self.createClippedData('RawData:YearDailyViaIsZero', data)
        self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly)
        self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly)

        # Subdomain breakdown: only rows with ison == 'y' and iszero == 'y' on the allowed subdomains.
        allowedSubdomains = ['m', 'zero']
        data = allData[(allData.ison == 'y') & (allData.iszero == 'y') & (allData.subdomain.isin(allowedSubdomains))]
        data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        self.createClippedData('RawData:YearDailySubdomains', data)
        self.createPeriodData('RawData:WeeklySubdomains', data, weekly)
        self.createPeriodData('RawData:MonthlySubdomains', data, monthly)

        # Create artificial yes/no/opera sums.
        # Each selection is copied explicitly before the marker column is added:
        # assigning a column to a filtered selection of allData is chained
        # assignment and raises SettingWithCopyWarning.
        opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')].copy()
        opera['str'] = 'o'
        yes = allData[allData.iszero == 'y'].copy()
        yes['str'] = 'y'
        no = allData[allData.iszero == 'n'].copy()
        no['str'] = 'n'
        # Opera zero-rated rows appear in both 'o' and 'y'; 'o' is a subset total, not a disjoint bucket.
        combined = opera.append(yes).append(no)
        data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        headerFields = 'date,xcs,iszero,count'  # Override "str" as "iszero"
        self.createClippedData('RawData:YearDailyTotals', data, headerFields)
        self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields)

        # Per-xcs language share: the five largest languages plus an 'other' remainder,
        # each expressed as a percentage of that xcs's total count.
        data = []
        for xcsId in allData.xcs.unique():
            byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'], aggfunc=np.sum) \
                .order('count', ascending=False)
            top = byLang.head(5)
            vals = list(top.iteritems())
            vals.append(('other', byLang.sum() - top.sum()))
            # Dividing the total by 100 up front makes c / valsTotal a percentage.
            valsTotal = sum(v[1] for v in vals) / 100.0
            data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals])

        self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
Example #2
0
    def test_record_prefix(self, state_data):
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties',
                                meta='state',
                                record_prefix='county_')

        expected = []
        for rec in state_data:
            expected.extend(rec['counties'])
        expected = DataFrame(expected)
        expected = expected.rename(columns=lambda x: 'county_' + x)
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])

        tm.assert_frame_equal(result, expected)
Example #3
0
    def test_simple_normalize(self, state_data):
        result = json_normalize(state_data[0], 'counties')
        expected = DataFrame(state_data[0]['counties'])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties')

        expected = []
        for rec in state_data:
            expected.extend(rec['counties'])
        expected = DataFrame(expected)

        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, 'counties', meta='state')
        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])

        tm.assert_frame_equal(result, expected)