def main(_):
    """Extract references from this task's share of the CommonCrawl WET files.

    Writes into a per-task subdirectory ``process_<task_id>`` of
    ``FLAGS.out_dir``. The WET files are globbed from
    ``FLAGS.commoncrawl_wet_dir`` when that flag is set; otherwise they come
    from ``utils.wet_download_urls`` using the system temp directory. The
    sorted file list is split with ``utils.shard`` and only the entry for
    ``FLAGS.task_id`` is processed.

    Args:
      _: Unused (presumably the argv handed over by the app runner).

    Raises:
      ValueError: If ``FLAGS.out_dir`` or ``FLAGS.metadata_dir`` is empty.
    """
    # Validate with explicit raises rather than `assert`: assertions are
    # removed under `python -O`, and an empty out_dir would then silently
    # resolve to a relative "process_<id>" directory.
    if not FLAGS.out_dir:
        raise ValueError("--out_dir must be set")
    if not FLAGS.metadata_dir:
        raise ValueError("--metadata_dir must be set")

    out_dir = os.path.join(FLAGS.out_dir, "process_%d" % FLAGS.task_id)
    tf.gfile.MakeDirs(out_dir)

    with utils.timing("get_refs_commoncrawl"):
        # Get all WET files
        if FLAGS.commoncrawl_wet_dir:
            wet_files = tf.gfile.Glob(
                os.path.join(FLAGS.commoncrawl_wet_dir, "*.wet.gz"))
        else:
            tmp_dir = tempfile.gettempdir()
            wet_files = list(
                utils.wet_download_urls(utils.WET_PATHS_BY_DATE["0917"],
                                        tmp_dir))

        # Shard and select this task's work. Sorting first gives every task
        # the same ordering to shard over.
        wet_files.sort()
        wet_files = utils.shard(wet_files, FLAGS.num_tasks)[FLAGS.task_id]
        tf.logging.info("Sharded out WET files. Processing %d files",
                        len(wet_files))

        wikisum.extract_references_from_wets(wet_files, FLAGS.metadata_dir,
                                             out_dir)
def main(_):
    """Produce the examples assigned to this task.

    Picks the CommonCrawl or Web problem from ``FLAGS.for_commoncrawl``, then
    takes entry ``FLAGS.task_id`` of the ``FLAGS.num_tasks``-way
    ``utils.shard`` split of both the problem's output file paths and the
    shard ids ``0..utils.NUM_SHARDS-1``. Those are passed to
    ``wikisum.produce_examples`` inside a ``utils.timing`` context.

    Side effect: when ``FLAGS.vocab_dir`` is empty it is overwritten with
    ``FLAGS.out_dir``.

    Args:
      _: Unused (presumably the argv handed over by the app runner).
    """
    problem = (wikisum.WikisumCommoncrawl() if FLAGS.for_commoncrawl
               else wikisum.WikisumWeb())

    # This task's slice of the output files.
    all_out_paths = problem.out_filepaths(FLAGS.out_dir)
    task_out_paths = utils.shard(all_out_paths, FLAGS.num_tasks)[FLAGS.task_id]

    # Fall back to the output directory for the vocab file.
    if not FLAGS.vocab_dir:
        FLAGS.vocab_dir = FLAGS.out_dir

    # This task's slice of the shard ids.
    all_shard_ids = list(range(utils.NUM_SHARDS))
    task_shard_ids = utils.shard(all_shard_ids, FLAGS.num_tasks)[FLAGS.task_id]

    with utils.timing("produce_examples"):
        vocab_path = os.path.join(FLAGS.vocab_dir, problem.vocab_filename)
        wikisum.produce_examples(
            shard_ids=task_shard_ids,
            wikis_dir=FLAGS.wikis_dir,
            refs_dir=FLAGS.refs_dir,
            urls_dir=FLAGS.urls_dir,
            vocab_path=vocab_path,
            out_filepaths=task_out_paths)
def main(_):
    """Run ``wikisum.produce_examples`` for this task's portion of the work.

    The problem object is ``wikisum.WikisumCommoncrawl`` when
    ``FLAGS.for_commoncrawl`` is set and ``wikisum.WikisumWeb`` otherwise.
    Output file paths and shard ids are each split ``FLAGS.num_tasks`` ways
    with ``utils.shard``, and the entry at ``FLAGS.task_id`` is used.

    Side effect: an empty ``FLAGS.vocab_dir`` is replaced by
    ``FLAGS.out_dir``.

    Args:
      _: Unused (presumably the argv handed over by the app runner).
    """
    # Choose the problem class first, then instantiate exactly one of them.
    problem_cls = (wikisum.WikisumCommoncrawl if FLAGS.for_commoncrawl
                   else wikisum.WikisumWeb)
    problem = problem_cls()

    sharded_out_paths = utils.shard(
        problem.out_filepaths(FLAGS.out_dir), FLAGS.num_tasks)
    my_out_paths = sharded_out_paths[FLAGS.task_id]

    if not FLAGS.vocab_dir:
        # No separate vocab directory given: reuse the output directory.
        FLAGS.vocab_dir = FLAGS.out_dir

    sharded_ids = utils.shard(list(range(utils.NUM_SHARDS)), FLAGS.num_tasks)
    my_shard_ids = sharded_ids[FLAGS.task_id]

    with utils.timing("produce_examples"):
        produce_kwargs = dict(
            shard_ids=my_shard_ids,
            wikis_dir=FLAGS.wikis_dir,
            refs_dir=FLAGS.refs_dir,
            urls_dir=FLAGS.urls_dir,
            vocab_path=os.path.join(FLAGS.vocab_dir, problem.vocab_filename),
            out_filepaths=my_out_paths,
        )
        wikisum.produce_examples(**produce_kwargs)