def dataset(): """Returns partial dataset with pagecounts.""" df = features.pageids_canonical().set_index('pageid') np.testing.assert_(df.index.is_unique) n_records = len(df) # merge titles titles = features.titles() df = pd.merge(df, titles, left_index=True, right_index=True) np.testing.assert_equal(n_records, len(df)) # merge redirects redirects = _redirects_of_interest(df.title) df = pd.merge(df, redirects, how='left', left_on='title', right_index=True) df['redirect_set'] = df['redirect_set'].fillna(set()) df['n_redirect'] = df['redirect_set'].apply(len) np.testing.assert_equal(n_records, len(df)) del titles del redirects # gather pagecounts for all titles associated with a page # this code is awkward because we need to avoid making many lookups in a # very large HDF5 file. We make just one query and then rearrange results. titles_and_redirect_titles_of_interest = set(df.title.tolist()) for s in df.redirect_set: titles_and_redirect_titles_of_interest.update(s) # to avoid memory errors we do this in stages def chunks(iterable, size): it = iter(iterable) item = list(itertools.islice(it, size)) while item: yield item item = list(itertools.islice(it, size)) chunk_size = 30 titles_chunks = chunks(titles_and_redirect_titles_of_interest, chunk_size) pagecounts = dict() logger.info("Retrieving pagecounts. This may take several hours.") for titles in titles_chunks: pagecounts_partial = pagecounts_for_titles(titles) pagecounts.update(pagecounts_partial) if len(pagecounts) % 10000 <= chunk_size: logging.info("Retrieving pagecount {} of {}".format(len(pagecounts), len(titles_and_redirect_titles_of_interest))) pagecounts_ordered = [] for i, (index, row) in enumerate(df.iterrows()): count = pagecounts.get(row['title'], 0) for redirect_title in row['redirect_set']: count += pagecounts.get(redirect_title, 0) pagecounts_ordered.append(count) df['pagecount'] = pagecounts_ordered return df[['title', 'pagecount']]
def _redirects_of_interest(titles_of_interest):
    titles = features.titles()
    redirects = features.redirects()

    # restrict redirects to universe and add pageid
    redirects = redirects[redirects.redirect_to.isin(titles_of_interest)].reset_index(drop=True)
    redirects = pd.merge(redirects, titles, how='left', left_on='redirect_from', right_index=True)
    del redirects['redirect_from']  # old redirect_from, was pageid
    redirects.rename(columns={'title': 'redirect_from'}, inplace=True)
    redirects = redirects.set_index('redirect_to')

    # remove NaN; these correspond to pages that have no title in the dump (likely deleted?)
    redirects = redirects[pd.notnull(redirects.redirect_from)]

    # now the redirects DataFrame is in a usable form
    redirects = redirects.groupby(redirects.index)['redirect_from'].apply(set)
    redirects.name = 'redirect_set'
    return pd.DataFrame(redirects)
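# A toy sketch of the groupby/apply(set) step in _redirects_of_interest():
# one row per redirect collapses into one set of source titles per target title.
# The titles are made up, and the example assumes the module-level pandas import
# used by the rest of this file.
def _example_redirect_set():
    example = pd.DataFrame({
        'redirect_to': ['Ada Lovelace', 'Ada Lovelace', 'Alan Turing'],
        'redirect_from': ['Augusta Ada King', 'Countess of Lovelace', 'Turing, Alan'],
    }).set_index('redirect_to')
    redirect_sets = example.groupby(example.index)['redirect_from'].apply(set)
    redirect_sets.name = 'redirect_set'
    # index: target title; value: set of titles redirecting to it
    return pd.DataFrame(redirect_sets)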
def dataset(langlink_max_features, death_decade_start, death_decade_stop):
    """Returns the full dataset.

    Limit language link indicators to `langlink_max_features`. Create death
    decade indicators for decades between `death_decade_start` and
    `death_decade_stop`.
    """
    df = features.pageids_canonical().set_index('pageid')
    np.testing.assert_(df.index.is_unique)
    np.testing.assert_(df.notnull().values.all())
    n_records = len(df)

    # merge titles
    titles = features.titles()
    df = pd.merge(df, titles, left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge redirects
    redirects = features.redirects_of_interest()
    df = pd.merge(df, redirects, how='left', left_on='title', right_index=True)
    df['redirect_set'] = df['redirect_set'].fillna(set())
    df['n_redirect'] = df['redirect_set'].apply(len)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())
    del titles
    del redirects

    # merge topics
    df = pd.merge(df, features.topics(), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge pagecounts
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.pagecounts(), left_index=True, right_index=True, how='left')
    # because the only column with NaN values is pagecount, we may fill with 0 safely.
    # Note that a pagecount of NaN means a page got 0 or 1 hits during the period studied.
    df = df.fillna(0)
    np.testing.assert_((df.pagecount >= 0).all())
    np.testing.assert_equal(n_records, len(df))

    # merge langlink counts
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.langlink_counts(), left_index=True, right_index=True, how='left')
    # because the only columns with NaN values are dummies, we may fill with 0 safely
    df = df.fillna(0)
    np.testing.assert_(df.notnull().values.all())
    np.testing.assert_equal(n_records, len(df))

    # merge imagelinks
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.imagelink_counts(), left_index=True, right_index=True, how='left')
    df = df.fillna(0)
    np.testing.assert_(df.notnull().values.all())
    np.testing.assert_equal(n_records, len(df))

    # merge article lengths
    df = pd.merge(df, features.article_lengths(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge last revision timestamp
    df = pd.merge(df, features.last_revision(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge article age
    df = pd.merge(df, features.article_age(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge revisions per day rates
    df = pd.merge(df, features.revisions_per_day(df.index), left_index=True, right_index=True)
    np.testing.assert_equal(n_records, len(df))

    # merge categories (comma-separated list of categories)
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.categories_flat(), left_index=True, right_index=True, how='left')
    # the only column with NaN values here is the categories column, so we may fillna safely
    df = df.fillna(set())
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge death decade dummies
    df = pd.merge(df, features.death_decade_dummies(death_decade_start, death_decade_stop),
                  left_index=True, right_index=True, how='left')
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge category dummies of interest
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.category_dummies_of_interest(), left_index=True, right_index=True, how='left')
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge langlink dummies
    np.testing.assert_(df.notnull().values.all())
    df = pd.merge(df, features.langlink_dummies(max_features=langlink_max_features),
                  left_index=True, right_index=True, how='left')
    # because the only columns with NaN values are dummies, we may fillna safely
    df = df.fillna(0)
    np.testing.assert_equal(n_records, len(df))
    np.testing.assert_(df.notnull().values.all())

    # merge online books page data
    df = pd.merge(df, features.digital_editions(), left_index=True, right_index=True, how='left')
    df.obp_digital_editions = df.obp_digital_editions.fillna(0)
    np.testing.assert_equal(n_records, len(df))

    # merge death year (there will be many NaNs)
    df = pd.merge(df, features.death_year(), left_index=True, right_index=True, how='left')
    np.testing.assert_equal(n_records, len(df))

    return df
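# A hedged usage sketch of the full dataset builder above. The parameter values
# and output path are placeholders chosen for illustration, not taken from the
# actual pipeline configuration.
def _example_build_dataset():
    df = dataset(langlink_max_features=50,
                 death_decade_start=1000,
                 death_decade_stop=2010)
    # persist the assembled feature matrix for later modeling steps
    df.to_pickle('dataset.pkl')
    return df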