import itertools
import logging

import numpy as np
import pandas as pd

# `features` (project feature loaders) and `pagecounts_for_titles` (the HDF5
# pagecount lookup) are assumed to be provided by the surrounding project.
logger = logging.getLogger(__name__)


def dataset():
    """Return the partial dataset with pagecounts."""
    df = features.pageids_canonical().set_index('pageid')
    np.testing.assert_(df.index.is_unique)
    n_records = len(df)

    # merge titles
    titles = features.titles()
    df = pd.merge(df, titles, left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge redirects
    redirects = _redirects_of_interest(df.title)
    df = pd.merge(df, redirects, how='left', left_on='title', right_index=True)
    # fillna cannot take a set, so replace missing entries with empty sets explicitly
    df['redirect_set'] = df['redirect_set'].apply(lambda s: s if isinstance(s, set) else set())
    df['n_redirect'] = df['redirect_set'].apply(len)
    np.testing.assert_equal(n_records, len(df))
    del titles
    del redirects

    # gather pagecounts for all titles associated with a page
    # this code is awkward because we need to avoid making many lookups in a
    # very large HDF5 file. We make just one query and then rearrange results.
    titles_and_redirect_titles_of_interest = set(df.title.tolist())
    for s in df.redirect_set:
        titles_and_redirect_titles_of_interest.update(s)

    # to avoid memory errors we do this in stages
    def chunks(iterable, size):
        it = iter(iterable)
        item = list(itertools.islice(it, size))
        while item:
            yield item
            item = list(itertools.islice(it, size))
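    # A quick sanity check of the helper (illustrative):
    #     list(chunks(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]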

    chunk_size = 30
    titles_chunks = chunks(titles_and_redirect_titles_of_interest, chunk_size)
    pagecounts = dict()
    logger.info("Retrieving pagecounts. This may take several hours.")
    for titles in titles_chunks:
        pagecounts_partial = pagecounts_for_titles(titles)
        pagecounts.update(pagecounts_partial)
        # log progress roughly every 10,000 titles retrieved
        if len(pagecounts) % 10000 <= chunk_size:
            logger.info("Retrieved pagecounts for {} of {} titles".format(
                len(pagecounts), len(titles_and_redirect_titles_of_interest)))

    pagecounts_ordered = []
    for _, row in df.iterrows():
        count = pagecounts.get(row['title'], 0)
        for redirect_title in row['redirect_set']:
            count += pagecounts.get(redirect_title, 0)
        pagecounts_ordered.append(count)
    df['pagecount'] = pagecounts_ordered
    return df[['title', 'pagecount']]
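
# Example usage (hypothetical; assumes the project's `features` module and the
# pagecounts HDF5 store are available):
#
#     partial = dataset()
#     partial.head()  # index: pageid; columns: title, pagecount
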
def _redirects_of_interest(titles_of_interest):
    """Return, for each title of interest, the set of titles that redirect to it."""
    titles = features.titles()
    redirects = features.redirects()
    # restrict redirects to universe and add pageid
    redirects = redirects[redirects.redirect_to.isin(titles_of_interest)].reset_index(drop=True)
    redirects = pd.merge(redirects, titles, how='left', left_on='redirect_from', right_index=True)
    del redirects['redirect_from']  # old redirect_from, was pageid
    redirects.rename(columns={'title': 'redirect_from'}, inplace=True)
    redirects = redirects.set_index('redirect_to')

    # remove NaN; these correspond to pages that have no title in the dump (likely deleted?)
    redirects = redirects[pd.notnull(redirects.redirect_from)]

    # now the redirects DataFrame is in a usable form
    redirects = redirects.groupby(redirects.index)['redirect_from'].apply(set)
    redirects.name = 'redirect_set'
    return pd.DataFrame(redirects)
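
# Shape of the frame returned above (for reference): the index is the canonical
# title being redirected to, and `redirect_set` holds the set of titles that
# redirect to it, e.g.
#
#     redirect_to (index)    redirect_set
#     'Some Article'         {'Redirect A', 'Redirect B'}
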
def dataset(langlink_max_features, death_decade_start, death_decade_stop):
    """Returns the full dataset

    Limit language link indicators to `langlink_max_features`. Create death
    decade indicators for decades between `death_decade_start` and
    `death_decade_stop`.

    """
    df = features.pageids_canonical().set_index('pageid')
    np.testing.assert_(df.index.is_unique)
    np.testing.assert_(df.notnull().values.all())

    n_records = len(df)

    # merge titles
    titles = features.titles()
    df = pd.merge(df, titles, left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge redirects
    redirects = features.redirects_of_interest()
    df = pd.merge(df, redirects, how='left', left_on='title', right_index=True)
    # fillna cannot take a set, so replace missing entries with empty sets explicitly
    df['redirect_set'] = df['redirect_set'].apply(lambda s: s if isinstance(s, set) else set())
    df['n_redirect'] = df['redirect_set'].apply(len)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())
    del titles
    del redirects

    # merge topics
    df = pd.merge(df, features.topics(), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge pagecounts
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.pagecounts(), left_index=True, right_index=True, how='left')
    # The only column with NaN values is 'pagecount', so filling with 0 is safe.
    # A pagecount of NaN means the page got 0 or 1 hits during the period studied.
    df = df.fillna(0)
    np.testing.assert_((df.pagecount >= 0).all())
    np.testing.assert_equal(n_records, len(df))

    # merge langlinks counts
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.langlink_counts(), left_index=True, right_index=True, how='left')
    # The only columns with NaN values are the langlink counts, so filling with 0 is safe.
    df = df.fillna(0)
    np.testing.assert_(df.notnull().values.all())
    np.testing.assert_equal(n_records, len(df))

    # merge imagelinks
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.imagelink_counts(), left_index=True, right_index=True, how='left')
    df = df.fillna(0)
    np.testing.assert_(df.notnull().values.all())
    np.testing.assert_equal(n_records, len(df))

    # merge article lengths
    df = pd.merge(df, features.article_lengths(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge last revision timestamp
    df = pd.merge(df, features.last_revision(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge article age
    df = pd.merge(df, features.article_age(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge revisions per day rates
    df = pd.merge(df, features.revisions_per_day(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge categories (comma-separated list of categories)
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.categories_flat(), left_index=True, right_index=True, how='left')
    # The flat category column is the only one with NaN values; fillna cannot
    # take a set, so replace missing entries with empty sets explicitly.
    for col in df.columns[df.isnull().any()]:
        df[col] = df[col].apply(lambda v: v if pd.notnull(v) else set())
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge death decade dummies
    df = pd.merge(df, features.death_decade_dummies(death_decade_start, death_decade_stop), left_index=True, right_index=True, how='left')
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge category dummies of interest
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.category_dummies_of_interest(), left_index=True, right_index=True, how='left')
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge langlink dummies
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.langlink_dummies(max_features=langlink_max_features), left_index=True, right_index=True, how='left')
    # because the only columns with NaN values are dummies, we may fillna safely
    df = df.fillna(0)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge online books page data
    df = pd.merge(df, features.digital_editions(), left_index=True, right_index=True, how='left')
    df.obp_digital_editions = df.obp_digital_editions.fillna(0)
    np.testing.assert_equal(n_records, len(df))

    # merge death year (there will be many NaNs)
    df = pd.merge(df, features.death_year(), left_index=True, right_index=True, how='left')
    np.testing.assert_equal(n_records, len(df))

    return df
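
# Example usage (hypothetical parameter values; the real values come from the
# project's configuration):
#
#     full = dataset(langlink_max_features=50,
#                    death_decade_start=1750,
#                    death_decade_stop=2010)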