def get_top_languages(self):
     """Collect the language codes listed in the top-sites file.

     The file at ``self.topsites_file`` is loaded with ``csv_to_list`` and
     the first field of every row is taken as a language code. A warning
     is logged when the number of distinct codes is not
     ``TOP_LANGUAGES_COUNT``; the codes are returned either way.

     Returns:
         set<string>: e.g. {'en', 'de', 'sv', ...}
     """
     rows = csv_to_list(self.spark, self.topsites_file)
     top_languages = set()
     for row in rows:
         # Only the first column of each row is used.
         top_languages.add(row[0])
     found = len(top_languages)
     if found != TOP_LANGUAGES_COUNT:
         message = 'We got %d top languages, and not %d.' % (
             found, TOP_LANGUAGES_COUNT)
         log(message, 'warning')
     return top_languages
 def calculate_pageviews(self, wikidata, languages):
     """Calculate pageviews, normalized ranks, and log ranks.

     Pageviews fetched via ``self.get_pageviews_from_hive`` are joined to
     ``wikidata`` on (page title, wiki), ranked per wiki by view count,
     and pivoted so that each row is one ``id`` with, for every language
     ``xx``, the columns ``xx_view_count``, ``xx_rank``,
     ``xx_normalized_rank`` and ``xx_log_rank``. Missing values are
     filled with 0.

     Args:
         wikidata (dataframe): must expose ``id``, ``title`` and ``site``.
         languages (iterable of strings): language codes without the
             'wiki' suffix; callers in this class pass sets. It is
             iterated more than once, so it must not be a one-shot
             iterator.
     Returns:
         dataframe
     """
     pageviews = self.get_pageviews_from_hive(languages)
     pageviews = pageviews\
         .alias('p')\
         .join(
             wikidata.alias('w'),
             (F.col('p.page_title') == F.col('w.title')) &
             (F.col('p.wiki') == F.col('w.site')),
         )\
         .select(F.col('w.id').alias('id'),
                 F.col('p.wiki').alias('wiki'),
                 F.col('p.view_count').alias('view_count'))
     # NOTE(review): the ordering is ascending, so within each wiki the
     # least viewed article gets rank 1 -- confirm this is intended.
     window = Window.partitionBy('wiki').orderBy('view_count')
     pageviews = pageviews.select(
         F.col('*'),
         F.rank().over(window).alias('rank')
     )
     wikis = ['%swiki' % x for x in languages]
     # Alias the aggregates explicitly. With several aggregations, pivot
     # names its output columns '<pivot value>_<aggregate name>'. Without
     # an alias the aggregate name is Spark's auto-generated expression
     # string (e.g. 'first(view_count, false)'), which is not stable
     # across Spark versions, and withColumnRenamed silently does nothing
     # when the expected name is absent.
     pageviews = pageviews\
         .groupBy('id')\
         .pivot('wiki', wikis)\
         .agg(F.first('view_count').alias('view_count'),
              F.first('rank').alias('rank'))\
         .fillna(0)
     for language in languages:
         # '<lang>wiki_<metric>' -> '<lang>_<metric>'
         pageviews = pageviews\
             .withColumnRenamed(
                 '%swiki_view_count' % language,
                 '%s_view_count' % language
             )\
             .withColumnRenamed('%swiki_rank' % language,
                                '%s_rank' % language)
         # Number of ids that have at least one view in this language;
         # used as the denominator of the normalized rank. This triggers
         # one Spark action per language.
         article_count = pageviews\
             .where(F.col('%s_view_count' % language) != 0)\
             .count()
         pageviews = pageviews\
             .withColumn(
                 '%s_normalized_rank' % language,
                 F.col('%s_rank' % language) / article_count
             )\
             .withColumn('%s_log_rank' % language,
                         F.log('%s_rank' % language))\
             .fillna(0)
     log('Calculated pageviews.', 'debug')
     return pageviews
Exemple #3
0
def main():
    """Main entry point of the script.

    Reads the command line options, opens a Spark session named after
    ``options.spark_app_name`` and, if the options validate, builds a
    ``NormalizedScores`` instance and calls its ``train`` method. Nothing
    is trained when validation fails.
    """
    options = get_cmd_options()
    spark = get_spark_session(options.spark_app_name)

    # Bail out early; validate_cmd_options logs the reason itself.
    if not validate_cmd_options(spark, options):
        return

    log('Options: %s' % str(options), 'debug')
    scorer_args = (
        spark,
        options.language_pairs,
        options.end_date,
        options.wikidata_dir,
        options.topsites_file,
        options.output_dir,
        options.tmp_dir,
    )
    scorer = NormalizedScores(*scorer_args)
    scorer.train()
 def get_missing_pageviews(self, wikidata, pageviews):
     """Return pageviews for languages that don't have them computed.

     A language counts as already computed when ``pageviews`` has a
     ``<language>_view_count`` column. Every language in
     ``self.language_pairs_set`` without such a column is passed to
     ``self.calculate_pageviews``.

     Args:
         wikidata (dataframe): forwarded to ``calculate_pageviews``.
         pageviews (dataframe): previously computed pageviews; only its
             column names are inspected.
     Returns:
         dataframe: pageviews for the missing languages, or None when no
             language is missing.
     """
     suffix = '_view_count'
     # Strip the suffix rather than splitting on the first underscore:
     # language codes may themselves contain underscores (e.g.
     # 'zh_min_nan'), and cutting them short would make such languages
     # look missing on every run.
     existing_languages = {
         column[:-len(suffix)] for column in pageviews.columns
         if column.endswith(suffix)
     }
     missing_languages = self.language_pairs_set\
                             .difference(existing_languages)
     log('Pageviews for these languages are missing: %s' %
         missing_languages, 'debug')
     if not missing_languages:
         return None
     return self.calculate_pageviews(wikidata, missing_languages)
 def get_pageviews(self, wikidata):
     """Return page view counts, normalized and log ranks.

     Results are cached as a parquet file whose name is built from
     ``self.tmp_dir``, ``self.start_date`` and ``self.end_date``:

     * If the file cannot be read, pageviews are calculated for
       ``self.top_languages`` plus ``self.language_pairs_set``, saved,
       and returned.
     * If the file can be read but lacks some of the requested languages,
       only those are calculated, merged into the cached data, and the
       cache is refreshed.

     Args:
         wikidata (dataframe)
     Returns:
         dataframe: pageviews, normalized ranks, and log ranks.
     """
     filename = '%s/article-pageviews-%s-%s' %\
         (self.tmp_dir, self.start_date, self.end_date)
     # Keep the try limited to the read. Wrapping the merge step as well
     # would report any failure there as a cache miss, trigger a full
     # recalculation, and then fail again when writing over the existing
     # file.
     try:
         pageviews = self.spark.read.parquet(filename)
     except Exception:
         # Best effort: any failure to read the cache is treated as a
         # cache miss.
         log('Starting to calculate pageviews and save as %s.'
             % filename, 'debug')
         pageviews = self.calculate_pageviews(
             wikidata,
             self.top_languages.union(self.language_pairs_set)
         )
         pageviews.write.parquet(filename)
         return pageviews
     # If we're training language pairs that were not trained
     # before, we'll need to generate missing pageviews and save
     # them.
     missing_pageviews = self.get_missing_pageviews(wikidata, pageviews)
     if missing_pageviews is not None:
         # Outer join, so that articles viewed only in the cached
         # languages or only in the new ones are both kept. The absent
         # side is filled with 0, as calculate_pageviews does for
         # languages without views.
         pageviews = pageviews\
             .join(missing_pageviews, on='id', how='outer')\
             .fillna(0)
         # The merged frame is still lazily backed by `filename`, so it
         # cannot be written over that path directly (and the default
         # save mode refuses to write to an existing path). Materialize
         # it in a staging location first, then refresh the cache from
         # there.
         # TODO: think about saving additions only.
         staging = '%s-merged' % filename
         pageviews.write.mode('overwrite').parquet(staging)
         pageviews = self.spark.read.parquet(staging)
         pageviews.write.mode('overwrite').parquet(filename)
     log('Returning existing pageviews from %s.'
         % filename, 'debug')
     return pageviews
Exemple #6
0
def validate_cmd_options(spark, options):
    """Validate command line options passed by the user.

    Checks, in order, that for every (source, target) pair in
    ``options.language_pairs`` both ``<source>wiki`` and ``<target>wiki``
    appear in the list loaded from ``options.dblist_file`` and that the
    two languages differ, and then that ``options.end_date`` is not after
    today's date. The first failed check is logged as an error.

    Args:
        spark (SparkSession)
        options (object)
    Returns:
        bool: In case of error, False is returned. Otherwise, True.
    """
    known_wikis = get_wikipedia_dblist(spark, options.dblist_file)

    for pair in options.language_pairs:
        source, target = pair
        source_wiki = '%swiki' % source
        if source_wiki not in known_wikis:
            log('Unrecognized source language: %s' % source, 'error')
            return False
        target_wiki = '%swiki' % target
        if target_wiki not in known_wikis:
            log('Unrecognized target language: %s' % target, 'error')
            return False
        if source == target:
            log('Source and target languages cannot be the same.', 'error')
            return False

    today = datetime.today().date()
    if options.end_date > today:
        log('End date cannot be later than today: %s.' % options.end_date,
            'error')
        return False

    return True