import json
import multiprocessing.dummy
import os

import mjolnir.feature_engineering
import mjolnir.training.tuning
from pyspark.sql import functions as F

# Helpers such as as_local_path, as_output_file, hdfs_mkdir and the
# write_wiki_* functions are defined elsewhere in this module.


def _convert_xgboost_remote(in_path: str, out_path: str) -> None:
    # Stage the remote input locally, convert it, and write the result
    # back out through the (possibly remote) output path.
    with as_local_path(in_path) as local_input, \
            as_output_file(out_path, 'wb', overwrite=True) as local_output:
        _convert_xgboost_local(local_input, local_output.name)
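
# Hypothetical usage sketch (not from the source); the paths are
# illustrative only, and the staging behavior of as_local_path /
# as_output_file is inferred from how they are used above:
#
#   _convert_xgboost_remote(
#       'hdfs://analytics/models/model.xgb',
#       'hdfs://analytics/models/model.converted.xgb')
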
def make_folds(sc, sqlContext, input_dir, output_dir, wikis, zero_features,
               num_folds, num_workers, max_executors):
    hdfs_mkdir(output_dir)
    df = sqlContext.read.parquet(input_dir) \
        .select('wikiid', 'query', 'features', 'label', 'norm_query_id')
    if wikis:
        df = df.where(F.col('wikiid').isin(wikis))
    # Per-wiki observation counts, used to validate the requested wikis
    # and to order the work below.
    counts = df.groupBy('wikiid').agg(F.count(F.lit(1)).alias('n_obs')).collect()
    counts = {row.wikiid: row.n_obs for row in counts}
    if not wikis:
        wikis = list(counts.keys())
    else:
        missing = set(wikis).difference(counts.keys())
        for wiki in missing:
            print('No observations available for ' + wiki)
        wikis = list(set(wikis).intersection(counts.keys()))
    if not wikis:
        raise Exception('No wikis provided')
    # Sort to descending size, so mapping over them does the largest first
    wikis.sort(reverse=True, key=lambda wiki: counts[wiki])
    if zero_features:
        df = mjolnir.feature_engineering.zero_features(df, zero_features)
    if max_executors is None:
        max_executors = num_workers
    # TODO: Limit size?
    pool = multiprocessing.dummy.Pool(len(wikis) * 3)
    # Assign each (wikiid, query) group to a fold, and lay the rows out
    # consistently for the per-wiki writers below.
    df_fold = (
        mjolnir.training.tuning.group_k_fold(df, num_folds)
        .repartition(200, 'wikiid', 'query')
        .sortWithinPartitions('wikiid', 'query', F.col('label').asc()))
    try:
        # Materialize the fold assignment once; the thread pool reads it
        # repeatedly.
        df_fold.cache()
        df_fold.count()

        wiki_stats = {}
        for wiki in wikis:
            df_wiki = df_fold.where(F.col('wikiid') == wiki).drop('wikiid')
            path_format = os.path.join(output_dir, wiki + '.%s.f%s.p%d')
            metadata = dict(df.schema['features'].metadata)
            # A wiki-specific feature list overrides the shared one.
            if 'wiki_features' in metadata:
                metadata['features'] = metadata['wiki_features'][wiki]
                del metadata['wiki_features']
            wiki_stats[wiki] = {
                'all': pool.apply_async(
                    write_wiki_all,
                    (sc, df_wiki, num_workers, None, path_format,
                     metadata['features'])),
                'folds': pool.apply_async(
                    write_wiki_folds,
                    (sc, df_wiki, num_workers, 'fold', path_format,
                     metadata['features'])),
                'stats': pool.apply_async(
                    make_df_wiki_stats, (df_wiki, metadata, counts[wiki])),
            }
        # .get() blocks until each async task finishes and re-raises any
        # exception from the worker threads.
        wiki_stats = {wiki: {k: v.get() for k, v in stats.items()}
                      for wiki, stats in wiki_stats.items()}
        for wiki in wikis:
            wiki_stats[wiki]['num_folds'] = num_folds
            wiki_stats[wiki]['num_workers'] = num_workers
    finally:
        df_fold.unpersist()

    with as_output_file(os.path.join(output_dir, 'stats.json')) as f:
        f.write(json.dumps({
            'input_dir': input_dir,
            'wikis': wiki_stats,
        }))
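
# Sketch of the resulting output_dir layout, inferred from path_format and
# the stats.json write above. The exact fillers for the '%s.f%s.p%d' slots
# are chosen inside write_wiki_all / write_wiki_folds and are not shown in
# this excerpt:
#
#   output_dir/
#       <wiki>.<name>.f<fold>.p<partition>   # one file per wiki/fold/partition
#       stats.json                           # {'input_dir': ..., 'wikis': {
#                                            #    <wiki>: {'all': ..., 'folds': ...,
#                                            #      'stats': ..., 'num_folds': ...,
#                                            #      'num_workers': ...}}}
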
def saveBoosterAsHadoopFile(self, path: str):
    # Serialize via the local file handle provided by as_output_file.
    with as_output_file(path) as f:
        self.saveBoosterAsLocalFile(f.name)
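
# Hypothetical call (the surrounding class is not shown in this excerpt;
# 'model' stands in for an instance exposing saveBoosterAsLocalFile):
#
#   model.saveBoosterAsHadoopFile('hdfs://analytics/models/booster.xgb')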