def get_groundtruth(lang):
    """Fetch yesterday's top-50 articles and their pageview counts for a Wikipedia.

    When yesterday's data is not yet published by the pageviews API, falls
    back to the day before yesterday.

    :param lang: Wikipedia language code, e.g. 'en'
    :return: dict mapping article title -> {'gt-rank': int, 'gt-views': int}
    """
    client = PageviewsClient(
        user_agent="[email protected] -- diff private toolforge")
    project = '{0}.wikipedia'.format(lang)
    try:
        # year/month/day of None make the API default to yesterday.
        rows = client.top_articles(project=project,
                                   access='all-access',
                                   year=None, month=None, day=None,
                                   limit=50)
    except Exception:
        # Yesterday's dump may not exist yet -- best effort: go two days back.
        fallback_day = date.today() - timedelta(days=2)
        rows = client.top_articles(project=project,
                                   access='all-access',
                                   year=fallback_day.year,
                                   month=fallback_day.month,
                                   day=fallback_day.day,
                                   limit=50)
    return {row['article']: {'gt-rank': row['rank'], 'gt-views': row['views']}
            for row in rows}
class WikiIngest(object):
    """Collects daily top-article pageview counts from English Wikipedia and
    stores them in the wiki-trends collection."""

    def __init__(self):
        self.db_connection = DBConnection()
        self.logger = logging.getLogger(__name__)
        self.api = PageviewsClient(
            "Mozilla/5.0 (X11; Linux x86_64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        )

    def get_top_articles(self, time_collect=None, historic=False):
        """Fetch the top articles for one day and store them in the database.

        :param time_collect: datetime of the day to collect. Required when
            ``historic`` is True; ignored otherwise (yesterday is used).
        :param historic: when True, upsert documents keyed by (name, date)
            so backfills are idempotent; when False, bulk-insert yesterday's
            results.
        :raises ValueError: if ``historic`` is True and ``time_collect`` is
            not provided.
        """
        if not historic:
            # Live mode always collects yesterday, the most recent complete day.
            time_collect = datetime.now() - timedelta(days=1)
        elif time_collect is None:
            # Previously this fell through and crashed later with
            # AttributeError on time_collect.year; fail fast instead.
            raise ValueError("time_collect is required when historic=True")

        results = self.api.top_articles(project=WIKI_SOURCES.ENGLISH_WIKIPEDIA,
                                        year=time_collect.year,
                                        month=time_collect.month,
                                        day=time_collect.day)
        timestamp = calendar.timegm(time_collect.timetuple())

        articles_to_insert = []
        bulk_op = None
        if historic:
            bulk_op = self.db_connection.start_bulk_upsert(
                collection=DB.WIKI_TRENDS)

        date_string = time_collect.strftime("%A %B %d %Y")
        for result in results:
            # API titles use underscores; store human-readable names.
            # (replace is a no-op when there is no underscore.)
            name = result["article"].replace("_", " ")
            doc = {
                WIKI_TREND.NAME: name,
                WIKI_TREND.RANK: int(result["rank"]),
                WIKI_TREND.VIEWS: int(result["views"]),
                WIKI_TREND.TIMESTAMP: timestamp,
                WIKI_TREND.DATE_OBJECT: time_collect,
                WIKI_TREND.DATE_STRING: date_string,
                WIKI_TREND.MONTH: time_collect.strftime("%B").lower(),
                WIKI_TREND.WEEKDAY: time_collect.strftime("%A").lower(),
                WIKI_TREND.MONTH_DAY: time_collect.day,
                WIKI_TREND.YEAR: time_collect.strftime("%Y")
            }
            if historic:
                # Upsert keyed on (name, date string) so re-running a
                # backfill does not create duplicates.
                self.db_connection.add_to_bulk_upsert(
                    query={"$and": [
                        {WIKI_TREND.NAME: name},
                        {WIKI_TREND.DATE_STRING: date_string}
                    ]},
                    data=doc,
                    bulk_op=bulk_op)
            else:
                articles_to_insert.append(doc)

        if historic:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
        else:
            self.db_connection.bulk_insert(data=articles_to_insert,
                                           collection=DB.WIKI_TRENDS)
def download_pageviews(entities=None, start='20150701', end=None,
                       access='desktop', agent='user', limit=10000):
    """
    Download pageviews from Wikipedia.

    :param entities: A list of entities (Wikipedia pages) to get pageview
        data for, or a single title as a string
    :param start: The start date of the range over which to collect data;
        2015-07-01 is the earliest supported by the API
    :param end: The end date of the range, or None for today
    :param access: The method by which Wikipedia was accessed (default: desktop)
    :param agent: The user agent accessing Wikipedia (default: user)
    :param limit: The number of most-trafficked entities to return data for,
        if no entities are specified in the call
    :return: A DataFrame of entities x pageviews by day; a DataFrame with the
        single column 'NONE' on lookup failure
    """
    if end is None:
        end = datetime.date.today().strftime('%Y%m%d')

    p = PageviewsClient()
    dates = pd.date_range(start=start, end=end)

    # Accept a single title as well as a list.
    # (was: `type(entities) is str` -- isinstance is the idiomatic check)
    if isinstance(entities, str):
        entities = [entities]

    # If entities aren't passed in, get the daily top entities for the period.
    if entities is None:
        df_pvs = None
        for d in dates:
            try:
                df = pd.DataFrame(p.top_articles('en.wikipedia',
                                                 year=d.year, month=d.month,
                                                 day=d.day, limit=limit,
                                                 access=access))
            except Exception:
                # Data for this day may be missing; skip it.
                # (was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit)
                continue
            df = df.set_index('article').rename(columns={'views': d})[[d]]
            df_pvs = df if df_pvs is None else df_pvs.join(df, how='outer')
        if df_pvs is None:
            # Every day failed: previously this crashed with AttributeError;
            # return the same sentinel frame used for lookup failures below.
            return pd.DataFrame(columns=['NONE'])
        entities = df_pvs.index.values.tolist()

    # Normalize titles via the Wikipedia API; both failure modes get the
    # same handling (the two branches were previously duplicated).
    for i in range(len(entities)):
        try:
            entities[i] = unidecode(wikipedia.page(entities[i]).title)
        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError) as e:
            # print() calls replace Python 2 print statements, which are a
            # syntax error under Python 3.
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            return pd.DataFrame(columns=['NONE'])

    search = p.article_views('en.wikipedia', entities,
                             start=start, end=end,
                             access=access, agent=agent)
    return pd.DataFrame.from_dict(search, orient='index')