Exemple #1
0
def get_groundtruth(lang):
    """Return the top-50 articles of a Wikipedia language edition, keyed by title.

    The pageviews client is first queried without an explicit date (per the
    note in the original code, the client then defaults to yesterday). If
    that request raises for any reason, the query is retried once for the
    day two days before today.

    :param lang: language code substituted into '<lang>.wikipedia'
    :return: dict mapping article title to
        ``{'gt-rank': <rank>, 'gt-views': <views>}``
    """
    client = PageviewsClient(
        user_agent="[email protected] -- diff private toolforge")
    wiki_project = '{0}.wikipedia'.format(lang)

    try:
        # No date given: let the client choose its default day.
        top_rows = client.top_articles(project=wiki_project,
                                       access='all-access',
                                       year=None,
                                       month=None,
                                       day=None,
                                       limit=50)
    except Exception:
        # Best-effort fallback: ask explicitly for the day before yesterday.
        fallback_day = date.today() - timedelta(days=2)
        top_rows = client.top_articles(project=wiki_project,
                                       access='all-access',
                                       year=fallback_day.year,
                                       month=fallback_day.month,
                                       day=fallback_day.day,
                                       limit=50)

    by_title = {}
    for row in top_rows:
        by_title[row['article']] = {
            'gt-rank': row['rank'],
            'gt-views': row['views'],
        }
    return by_title
class WikiIngest(object):
    """Fetch a day's top English-Wikipedia articles and store them as trend documents."""

    def __init__(self):
        self.db_connection = DBConnection()
        self.logger = logging.getLogger(__name__)
        self.api = PageviewsClient(
            "Mozilla/5.0 (X11; Linux x86_64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        )

    def get_top_articles(self, time_collect=None, historic=False):
        """Fetch the top articles for one day and write them to DB.WIKI_TRENDS.

        :param time_collect: datetime of the day to collect. Only honoured
            when ``historic`` is true; otherwise it is replaced by
            "now minus one day".
        :param historic: when true, every document is sent through the
            connection's bulk-upsert calls with a query on article name and
            date string; when false, all documents are passed to
            ``bulk_insert`` in one call.
        :raises ValueError: if ``historic`` is true and ``time_collect`` is
            None (there is no day to collect).
        """
        if not historic:
            time_collect = datetime.now() - timedelta(days=1)
        elif time_collect is None:
            # Fail early with a clear message instead of an AttributeError
            # on ``None.year`` further down.
            raise ValueError("time_collect is required when historic=True")

        results = self.api.top_articles(project=WIKI_SOURCES.ENGLISH_WIKIPEDIA,
                                        year=time_collect.year,
                                        month=time_collect.month,
                                        day=time_collect.day)

        # Everything derived from the collection date is identical for all
        # articles, so compute it once instead of once per result.
        # NOTE(review): calendar.timegm treats the time tuple as UTC, while
        # the non-historic path uses naive local datetime.now() -- confirm
        # this is intended.
        date_string = time_collect.strftime("%A %B %d %Y")
        date_fields = [
            (WIKI_TREND.TIMESTAMP, calendar.timegm(time_collect.timetuple())),
            (WIKI_TREND.DATE_OBJECT, time_collect),
            (WIKI_TREND.DATE_STRING, date_string),
            (WIKI_TREND.MONTH, time_collect.strftime("%B").lower()),
            (WIKI_TREND.WEEKDAY, time_collect.strftime("%A").lower()),
            (WIKI_TREND.MONTH_DAY, time_collect.day),
            (WIKI_TREND.YEAR, time_collect.strftime("%Y")),
        ]

        docs = []
        for result in results:
            doc = {
                # Article titles come back with underscores; store spaces.
                WIKI_TREND.NAME: result["article"].replace("_", " "),
                WIKI_TREND.RANK: int(result["rank"]),
                WIKI_TREND.VIEWS: int(result["views"]),
            }
            for key, value in date_fields:
                doc[key] = value
            docs.append(doc)

        if not historic:
            self.db_connection.bulk_insert(data=docs,
                                           collection=DB.WIKI_TRENDS)
            return

        # Historic runs may revisit a day, so each document is upserted on
        # its (name, date string) pair rather than inserted.
        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.WIKI_TRENDS)
        for doc in docs:
            query = {
                "$and": [
                    {WIKI_TREND.NAME: doc[WIKI_TREND.NAME]},
                    {WIKI_TREND.DATE_STRING: date_string},
                ]
            }
            self.db_connection.add_to_bulk_upsert(query=query,
                                                  data=doc,
                                                  bulk_op=bulk_op)
        self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
Exemple #3
0
def download_pageviews(entities=None, start='20150701', end=None, access='desktop', agent='user', limit=10000):
    """
    Download pageviews from Wikipedia

    :param entities: A list of entities (Wikipedia pages) to get pageview data for;
        a single string is treated as a one-element list. The passed sequence
        is not modified.
    :param start: The start date of the range over which to collect data;
        2015-07-01 is the earliest supported by the API
    :param end: The end date of the range, or None for today
    :param access: The method by which Wikipedia was accessed (default: desktop)
    :param agent: The user agent accessing Wikipedia (default: user)
    :param limit: The number of most-trafficked entities to return data for, if no entities are specified in the call
    :return: A DataFrame of entities x pageviews by day. An empty DataFrame
        with the single column 'NONE' is returned when an entity cannot be
        resolved to a Wikipedia page, or when no entities were given and no
        top-article data could be fetched for any day in the range.
    """
    if end is None:
        end = datetime.date.today().strftime('%Y%m%d')

    p = PageviewsClient()
    dates = pd.date_range(start=start, end=end)

    # str -> list
    if isinstance(entities, str):
        entities = [entities]

    # if entities aren't passed in, get the daily top entities for the period
    if entities is None:
        df_pvs = None

        for d in dates:
            try:
                df = pd.DataFrame(p.top_articles('en.wikipedia', year=d.year, month=d.month,
                                                 day=d.day, limit=limit, access=access))
            except Exception:
                # Best effort: a day whose request fails is skipped. Unlike
                # a bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate.
                continue

            df = df.set_index('article').rename(columns={'views': d})[[d]]

            # Outer join so the index ends up as the union of all days'
            # top articles.
            df_pvs = df if df_pvs is None else df_pvs.join(df, how='outer')

        if df_pvs is None:
            # No day produced data; use the same sentinel as the
            # unresolved-title case instead of failing on ``None.index``.
            return pd.DataFrame(columns=['NONE'])

        entities = df_pvs.index.values.tolist()

    # Normalise every entity to its canonical page title. Collect into a new
    # list so the caller's argument is left untouched.
    titles = []
    for entity in entities:
        try:
            titles.append(unidecode(wikipedia.page(entity).title))
        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError) as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            return pd.DataFrame(columns=['NONE'])

    search = p.article_views('en.wikipedia', titles, start=start, end=end, access=access, agent=agent)
    return pd.DataFrame.from_dict(search, orient='index')