def get_top_languages(self):
    """Return top Wikipedia languages by article count.

    Returns:
        set of strings: e.g. {'en', 'de', 'sv', ...}
    """
    topsites = csv_to_list(self.spark, self.topsites_file)
    languages = {x[0] for x in topsites}
    language_count = len(languages)
    if language_count != TOP_LANGUAGES_COUNT:
        log('Got %d top languages, not %d.' %
            (language_count, TOP_LANGUAGES_COUNT), 'warning')
    return languages
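# A minimal sketch of the expected topsites input, assuming csv_to_list()
# returns rows whose first column is the language code (file contents are
# hypothetical):
#
#     topsites.csv:
#         en,...
#         de,...
#         sv,...
#
# get_top_languages() would then return {'en', 'de', 'sv'}.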
def calculate_pageviews(self, wikidata, languages):
    """Calculate pageviews, normalized ranks, and log ranks.

    Args:
        wikidata (dataframe)
        languages (iterable of strings)

    Returns:
        dataframe
    """
    pageviews = self.get_pageviews_from_hive(languages)
    # Map page titles to Wikidata ids so that pageviews from
    # different wikis can be joined per item.
    pageviews = pageviews\
        .alias('p')\
        .join(
            wikidata.alias('w'),
            (F.col('p.page_title') == F.col('w.title')) &
            (F.col('p.wiki') == F.col('w.site')),
        )\
        .select(F.col('w.id').alias('id'),
                F.col('p.wiki').alias('wiki'),
                F.col('p.view_count').alias('view_count'))
    # Rank articles within each wiki by view count.
    window = Window.partitionBy('wiki').orderBy('view_count')
    pageviews = pageviews.select(
        F.col('*'),
        F.rank().over(window).alias('rank')
    )
    # Pivot so that each wiki's counts and ranks become columns
    # on a single row per Wikidata id.
    wikis = ['%swiki' % x for x in languages]
    pageviews = pageviews\
        .groupBy('id')\
        .pivot('wiki', wikis)\
        .agg(F.first('view_count'), F.first('rank'))\
        .fillna(0)
    for language in languages:
        pageviews = pageviews\
            .withColumnRenamed(
                '%swiki_first(view_count, false)' % language,
                '%s_view_count' % language
            )\
            .withColumnRenamed('%swiki_first(rank, false)' % language,
                               '%s_rank' % language)
        article_count = pageviews\
            .where(F.col('%s_view_count' % language) != 0)\
            .count()
        pageviews = pageviews\
            .withColumn(
                '%s_normalized_rank' % language,
                F.col('%s_rank' % language) / article_count
            )\
            .withColumn('%s_log_rank' % language,
                        F.log('%s_rank' % language))\
            .fillna(0)
    log('Calculated pageviews.', 'debug')
    return pageviews
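# For illustration, with languages=['en', 'de'] the returned dataframe is
# expected to have one row per Wikidata id with columns named after the
# renames above:
#
#     id, en_view_count, en_rank, en_normalized_rank, en_log_rank,
#         de_view_count, de_rank, de_normalized_rank, de_log_rank
#
# Articles with no pageviews in a given wiki get zeros via fillna(0).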
def main():
    """Main entry point of the script.

    Parses command line options, trains models, and makes predictions.
    """
    options = get_cmd_options()
    spark = get_spark_session(options.spark_app_name)
    if validate_cmd_options(spark, options):
        log('Options: %s' % str(options), 'debug')
        normalized_scores = NormalizedScores(spark,
                                             options.language_pairs,
                                             options.end_date,
                                             options.wikidata_dir,
                                             options.topsites_file,
                                             options.output_dir,
                                             options.tmp_dir)
        normalized_scores.train()
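# A hypothetical invocation via spark-submit. The script name and flag
# spellings are assumptions inferred from the option attribute names, not
# confirmed by this file:
#
#     spark-submit normalized_scores.py \
#         --language-pairs en-de,en-es \
#         --end-date 2019-01-31 \
#         --wikidata-dir /path/to/wikidata \
#         --topsites-file /path/to/topsites.csv \
#         --output-dir /path/to/output \
#         --tmp-dir /path/to/tmp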
def get_missing_pageviews(self, wikidata, pageviews):
    """Return pageviews for languages that don't have them computed.

    Args:
        wikidata (dataframe)
        pageviews (dataframe)

    Returns:
        dataframe, or None if no languages are missing.
    """
    existing_languages = {
        x.split('_')[0]
        for x in pageviews.columns
        if x.endswith('_view_count')
    }
    missing_languages = self.language_pairs_set\
        .difference(existing_languages)
    log('Pageviews for these languages are missing: %s'
        % missing_languages, 'debug')
    missing_pageviews = None
    if missing_languages:
        missing_pageviews = self.calculate_pageviews(
            wikidata, missing_languages)
    return missing_pageviews
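# For example, if pageviews.columns contains 'en_view_count' and
# 'de_view_count' while self.language_pairs_set is {'en', 'de', 'es'},
# only the 'es' pageviews are calculated and returned.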
def get_pageviews(self, wikidata):
    """Return page view counts, normalized ranks, and log ranks.

    Read from file if it exists; otherwise generate the data,
    save it, and return it.

    Args:
        wikidata (dataframe)

    Returns:
        dataframe: pageviews, normalized ranks, and log ranks.
    """
    filename = '%s/article-pageviews-%s-%s' %\
        (self.tmp_dir, self.start_date, self.end_date)
    try:
        pageviews = self.spark.read.parquet(filename)
        # If we're training language pairs that were not trained
        # before, we'll need to generate missing pageviews and save
        # them.
        missing_pageviews = self.get_missing_pageviews(wikidata, pageviews)
        if missing_pageviews is not None:
            pageviews = pageviews\
                .alias('p')\
                .join(missing_pageviews.alias('m'),
                      F.col('p.id') == F.col('m.id'))\
                .drop(F.col('m.id'))
            # TODO: think about saving additions only.
            pageviews.write.parquet(filename)
        log('Returning existing pageviews from %s.' % filename, 'debug')
    except Exception:
        log('Starting to calculate pageviews and save as %s.' % filename,
            'debug')
        pageviews = self.calculate_pageviews(
            wikidata, self.top_languages.union(self.language_pairs_set)
        )
        pageviews.write.parquet(filename)
    return pageviews
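# The cache path encodes the date range, e.g. (hypothetical values):
#
#     /path/to/tmp/article-pageviews-2019-01-01-2019-01-31
#
# so a new date range finds no parquet file to read and triggers a full
# recalculation in the except branch above.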
def validate_cmd_options(spark, options):
    """Validate command line options passed by the user.

    Args:
        spark (SparkSession)
        options (object)

    Returns:
        bool: In case of error, False is returned. Otherwise, True.
    """
    wikipedias = get_wikipedia_dblist(spark, options.dblist_file)
    for source, target in options.language_pairs:
        if '%swiki' % source not in wikipedias:
            log('Unrecognized source language: %s' % source, 'error')
            return False
        if '%swiki' % target not in wikipedias:
            log('Unrecognized target language: %s' % target, 'error')
            return False
        if source == target:
            log('Source and target languages cannot be the same.', 'error')
            return False
    if options.end_date > datetime.today().date():
        log('End date cannot be later than today: %s.' % options.end_date,
            'error')
        return False
    return True
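# For example, the pair ('en', 'en') is rejected because source and target
# match, and a pair like ('xx', 'de') is rejected unless 'xxwiki' appears
# in the dblist file.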