def main(_):
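  """Launch get_references_web_single_group sequentially for each URL group."""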
  shard_urls = fetch.get_urls_for_shard(FLAGS.urls_dir, FLAGS.shard_id)
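  # Ceiling division: the final group may hold fewer than URLS_PER_CLIENT URLs.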
  num_groups = int(math.ceil(len(shard_urls) / fetch.URLS_PER_CLIENT))
  tf.logging.info("Launching get_references_web_single_group sequentially for "
                  "%d groups in shard %d. Total URLs: %d",
                  num_groups, FLAGS.shard_id, len(shard_urls))
  command_prefix = FLAGS.command.split() + [
      "--urls_dir=%s" % FLAGS.urls_dir,
      "--shard_id=%d" % FLAGS.shard_id,
      "--debug_num_urls=%d" % FLAGS.debug_num_urls,
  ]
  with utils.timing("all_groups_fetch"):
    for i in range(num_groups):
      command = list(command_prefix)
      out_dir = os.path.join(FLAGS.out_dir, "process_%d" % i)
      command.append("--out_dir=%s" % out_dir)
      command.append("--group_id=%d" % i)
      try:
        # Even on 1 CPU, each group should finish within an hour.
        sp.check_call(command, timeout=60*60)
      except sp.TimeoutExpired:
        tf.logging.error("Group %d timed out", i)
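
# Example invocation (illustrative only; the script name, paths, and flag
# values below are assumptions, not defaults from this file):
#   python get_references_web.py \
#     --urls_dir=/path/to/wiki_urls \
#     --shard_id=0 \
#     --out_dir=/path/to/web_refs \
#     --command="python get_references_web_single_group.py"
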
def main(_):
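  """Fetch one group of URLs for a shard and write results to a TFRecord file."""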
  urls = get_urls_for_shard_group(
      FLAGS.urls_dir, FLAGS.shard_id, FLAGS.group_id)
  tf.logging.info("Fetching %d URLs for shard %d, group %d",
                  len(urls), FLAGS.shard_id, FLAGS.group_id)

  tf.gfile.MakeDirs(FLAGS.out_dir)
  out_fname = tfrecord_fname(FLAGS.out_dir, FLAGS.shard_id)

  with utils.timing("group_fetch"):
    logging_fnames = {}
    if FLAGS.log_samples:
      logging_fnames["samples"] = os.path.join(
          FLAGS.out_dir, "samples.%d.txt" % FLAGS.shard_id)
    loop = asyncio.get_event_loop()
    num_written = loop.run_until_complete(asyncio.ensure_future(
        fetch_urls(urls,
                   out_fname,
                   logging_fnames)))

  tf.logging.info("Total URLs: %d", len(urls))
  tf.logging.info("Num written: %d", num_written)
  tf.logging.info("Coverage: %.1f", (num_written / len(urls)) * 100)
def main(_):
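  """Extract references from this task's shard of CommonCrawl WET files."""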
  assert FLAGS.out_dir
  assert FLAGS.metadata_dir
  out_dir = os.path.join(FLAGS.out_dir, "process_%d" % FLAGS.task_id)
  tf.gfile.MakeDirs(out_dir)

  with utils.timing("get_refs_commoncrawl"):
    # Get all WET files
    if FLAGS.commoncrawl_wet_dir:
      wet_files = tf.gfile.Glob(
          os.path.join(FLAGS.commoncrawl_wet_dir, "*.wet.gz"))
    else:
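      # No local WET files were provided; download this crawl's WET shards
      # into a temporary directory.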
      tmp_dir = tempfile.gettempdir()
      wet_files = list(
          utils.wet_download_urls(utils.WET_PATHS_BY_DATE["0917"], tmp_dir))

    # Shard and select this task's work
    wet_files.sort()
    wet_files = utils.shard(wet_files, FLAGS.num_tasks)[FLAGS.task_id]
    tf.logging.info("Sharded out WET files. Processing %d files",
                    len(wet_files))

    wikisum.extract_references_from_wets(wet_files, FLAGS.metadata_dir, out_dir)
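
# Example invocation (illustrative only; the script name, paths, and flag
# values below are assumptions):
#   python get_references_commoncrawl.py \
#     --metadata_dir=/path/to/wiki_metadata \
#     --out_dir=/path/to/commoncrawl_refs \
#     --task_id=0 \
#     --num_tasks=1000
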
def main(_):
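  """Produce TF examples for this task's shards of the selected Wikisum problem."""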
  if FLAGS.for_commoncrawl:
    problem = wikisum.WikisumCommoncrawl()
  else:
    problem = wikisum.WikisumWeb()

  out_filepaths = problem.out_filepaths(FLAGS.out_dir)
  out_filepaths = utils.shard(out_filepaths, FLAGS.num_tasks)[FLAGS.task_id]

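  # Default the vocabulary location to the output directory.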
  if not FLAGS.vocab_dir:
    FLAGS.vocab_dir = FLAGS.out_dir

  shard_ids = utils.shard(list(range(utils.NUM_SHARDS)),
                          FLAGS.num_tasks)[FLAGS.task_id]

  with utils.timing("produce_examples"):
    wikisum.produce_examples(
        shard_ids=shard_ids,
        wikis_dir=FLAGS.wikis_dir,
        refs_dir=FLAGS.refs_dir,
        urls_dir=FLAGS.urls_dir,
        vocab_path=os.path.join(FLAGS.vocab_dir, problem.vocab_filename),
        out_filepaths=out_filepaths)