Example #1
def _convert_xgboost_remote(in_path: str, out_path: str) -> None:
    with as_local_path(in_path) as local_input, \
            as_output_file(out_path, 'wb', overwrite=True) as local_output:
        _convert_xgboost_local(local_input, local_output.name)
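Both helpers used here come from the surrounding code base rather than the standard library: as_local_path yields a local filesystem path for a possibly remote input, and as_output_file yields a writable local file whose contents end up at out_path once the with-block exits. The following is only a rough, purely local sketch of that contract (hypothetical code, not the project's actual implementation), with the remote download/upload steps reduced to comments:

import contextlib
import os
import shutil
import tempfile


@contextlib.contextmanager
def as_local_path(path):
    # Sketch: copy the input to a temporary file and yield that local path.
    # A real implementation would download remote URIs (hdfs://, swift://, ...) here.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        shutil.copyfile(path, tmp.name)
    try:
        yield tmp.name
    finally:
        os.unlink(tmp.name)


@contextlib.contextmanager
def as_output_file(path, mode='w', overwrite=False):
    # Sketch: yield a writable temporary file and move it into place afterwards.
    # A real implementation would upload to remote storage instead of moving.
    if not overwrite and os.path.exists(path):
        raise IOError('Refusing to overwrite ' + path)
    with tempfile.NamedTemporaryFile(mode=mode, delete=False) as tmp:
        yield tmp
    shutil.move(tmp.name, path)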
Example #2
def make_folds(sc, sqlContext, input_dir, output_dir, wikis, zero_features, num_folds, num_workers, max_executors):
    hdfs_mkdir(output_dir)
    df = sqlContext.read.parquet(input_dir) \
        .select('wikiid', 'query', 'features', 'label', 'norm_query_id')
    if wikis:
        df = df.where(F.col('wikiid').isin(wikis))

    counts = df.groupBy('wikiid').agg(F.count(F.lit(1)).alias('n_obs')).collect()
    counts = {row.wikiid: row.n_obs for row in counts}

    if not wikis:
        wikis = list(counts.keys())
    else:
        missing = set(wikis).difference(counts.keys())
        for wiki in missing:
            print('No observations available for ' + wiki)
        wikis = list(set(wikis).intersection(counts.keys()))
    if not wikis:
        raise Exception('No wikis to process')

    # Sort by descending size so the largest wikis are processed first.
    wikis.sort(reverse=True, key=lambda wiki: counts[wiki])

    if zero_features:
        df = mjolnir.feature_engineering.zero_features(df, zero_features)
    if max_executors is None:
        max_executors = num_workers

    # TODO: Limit size?
    pool = multiprocessing.dummy.Pool(len(wikis) * 3)

    df_fold = (
        mjolnir.training.tuning.group_k_fold(df, num_folds)
        .repartition(200, 'wikiid', 'query')
        .sortWithinPartitions('wikiid', 'query', F.col('label').asc()))

    try:
        df_fold.cache()
        # Materialize the cache once up front so the per-wiki jobs below reuse it.
        df_fold.count()

        wiki_stats = {}
        for wiki in wikis:
            df_wiki = df_fold.where(F.col('wikiid') == wiki).drop('wikiid')
            path_format = os.path.join(output_dir, wiki + '.%s.f%s.p%d')
            metadata = dict(df.schema['features'].metadata)
            # Prefer the per-wiki feature list over the global one when available.
            if 'wiki_features' in metadata:
                metadata['features'] = metadata['wiki_features'][wiki]
                del metadata['wiki_features']
            wiki_stats[wiki] = {
                'all': pool.apply_async(
                    write_wiki_all,
                    (sc, df_wiki, num_workers, None, path_format, metadata['features'])),
                'folds': pool.apply_async(
                    write_wiki_folds,
                    (sc, df_wiki, num_workers, 'fold', path_format, metadata['features'])),
                'stats': pool.apply_async(make_df_wiki_stats, (df_wiki, metadata, counts[wiki])),
            }

        wiki_stats = {wiki: {k: v.get() for k, v in stats.items()} for wiki, stats in wiki_stats.items()}
        for wiki in wikis:
            wiki_stats[wiki]['num_folds'] = num_folds
            wiki_stats[wiki]['num_workers'] = num_workers
    finally:
        df_fold.unpersist()

    with as_output_file(os.path.join(output_dir, 'stats.json')) as f:
        f.write(json.dumps({
            'input_dir': input_dir,
            'wikis': wiki_stats,
        }))
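The per-wiki writes above are submitted to a thread pool up front and only collected afterwards with .get(), so independent Spark jobs can overlap. Here is a minimal, self-contained sketch of that fan-out pattern, with a hypothetical process_wiki standing in for write_wiki_all / write_wiki_folds / make_df_wiki_stats (the wiki names are placeholders):

import multiprocessing.dummy  # thread-backed Pool; the real work is I/O- and Spark-bound


def process_wiki(wiki):
    # Stand-in for the real per-wiki work (writing folds, computing stats).
    return {'wiki': wiki, 'rows_written': 0}


wikis = ['enwiki', 'dewiki', 'frwiki']
pool = multiprocessing.dummy.Pool(len(wikis) * 3)
try:
    # apply_async returns AsyncResult handles immediately; .get() blocks until the
    # worker finishes and re-raises any exception raised inside the worker thread.
    pending = {wiki: pool.apply_async(process_wiki, (wiki,)) for wiki in wikis}
    results = {wiki: res.get() for wiki, res in pending.items()}
finally:
    pool.close()
    pool.join()

print(results)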
Example #3
def saveBoosterAsHadoopFile(self, path: str):
    with as_output_file(path) as f:
        self.saveBoosterAsLocalFile(f.name)
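This mirrors Example #1: the object only knows how to save a booster to a local file, and as_output_file upgrades that into saving to an arbitrary, possibly remote destination. A hypothetical call site, assuming model is an instance of the class defining these methods and that the path may point at HDFS:

# Hypothetical usage; `model` stands for whatever object exposes
# saveBoosterAsLocalFile / saveBoosterAsHadoopFile in the real code base.
model.saveBoosterAsHadoopFile('hdfs://namenode/models/ranker.xgb')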