Example #1
def extract_references_from_wets(wet_files,
                                 metadata_dir,
                                 out_dir,
                                 tmp_dir=None):
    """Extract references from WET files into sharded output files."""
    # Setup output files
    shard_files = make_ref_shard_files(out_dir)
    num_refs = 0
    total = len(wet_files)
    for i, wet_file in enumerate(wet_files):
        num_refs_in_wet = 0
        tf.logging.info(f"Processing file {i} of {total}")

        # Read metadata file
        metadata_fname = os.path.join(
            metadata_dir, os.path.basename(wet_file)) + cc_utils.METADTA_SUFFIX
        with tf.gfile.Open(cc_utils.readahead(metadata_fname)) as f:
            wet_metadata = json.loads(f.read())

        if not wet_metadata:
            # No references in this WET file
            continue

        if wet_file.startswith("http"):
            # download
            if not tmp_dir:
                tmp_dir = tempfile.gettempdir()
            record_gen = cc_utils.wet_records_from_url(wet_file, tmp_dir)
        else:
            # local
            record_gen = cc_utils.wet_records_from_file_obj(
                cc_utils.gzip_memfile(wet_file), take_ownership=True)
        for wet_record in record_gen:
            # these are shard ids that contain one or more Wikipedia
            # articles that cite this reference
            shard_ids = wet_metadata.get(wet_record.url)
            if not shard_ids:
                # URL not in dataset
                continue
            # Serialize and write out
            ex = _make_example_from_record(wet_record)
            ex_str = ex.SerializeToString()
            for shard_id in shard_ids:
                shard_files[shard_id].write(ex_str)
                shard_files[shard_id].flush()
            num_refs += 1
            num_refs_in_wet += 1
        tf.logging.info("Wrote out %d references for this WET",
                        num_refs_in_wet)

    tf.logging.info("Wrote out %d references total", num_refs)

    # Cleanup
    for shard_file in shard_files:
        shard_file.close()
Example #2
def extract_references_from_wets(wet_files, metadata_dir, out_dir,
                                 tmp_dir=None):
  """Extract references from WET files into sharded output files."""
  # Setup output files
  shard_files = make_ref_shard_files(out_dir)

  num_refs = 0
  for i, wet_file in enumerate(wet_files):
    num_refs_in_wet = 0
    tf.logging.info("Processing file %d", i)

    # Read metadata file
    metadata_fname = os.path.join(
        metadata_dir, os.path.basename(wet_file)) + cc_utils.METADTA_SUFFIX
    with tf.gfile.Open(cc_utils.readahead(metadata_fname)) as f:
      wet_metadata = json.loads(f.read())

    if not wet_metadata:
      # No references in this WET file
      continue

    if wet_file.startswith("http"):
      # download
      if not tmp_dir:
        tmp_dir = tempfile.gettempdir()
      record_gen = cc_utils.wet_records_from_url(wet_file, tmp_dir)
    else:
      # local
      record_gen = cc_utils.wet_records_from_file_obj(
          cc_utils.gzip_memfile(wet_file), take_ownership=True)

    for wet_record in record_gen:
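      # Shard IDs of the Wikipedia articles that cite this reference URL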
      shard_ids = wet_metadata.get(wet_record.url)
      if not shard_ids:
        # URL not in dataset
        continue

      # Serialize and write out
      ex = _make_example_from_record(wet_record)
      ex_str = ex.SerializeToString()
      for shard_id in shard_ids:
        shard_files[shard_id].write(ex_str)
      num_refs += 1
      num_refs_in_wet += 1

    tf.logging.info("Wrote out %d references for this WET", num_refs_in_wet)

  tf.logging.info("Wrote out %d references total", num_refs)

  # Cleanup
  for shard_file in shard_files:
    shard_file.close()
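
Both examples are excerpts from a larger module: they assume make_ref_shard_files, _make_example_from_record, and the cc_utils helpers are defined alongside them, and that os, json, tempfile, and tensorflow (tf) are already imported. The driver below is a minimal sketch of how the function might be called; the file paths, directory names, and URL are placeholder assumptions, not taken from the original code.

# Hypothetical driver for extract_references_from_wets.
# All paths and the URL below are made-up placeholders.
import tensorflow as tf

def main():
  wet_files = [
      # Local gzipped WET files or http(s) URLs to WET archives.
      "/data/wet/CC-MAIN-0000.warc.wet.gz",
      "https://example.org/wet/CC-MAIN-0001.warc.wet.gz",
  ]
  metadata_dir = "/data/wet_metadata"  # one <wet basename> + cc_utils.METADTA_SUFFIX file per WET file
  out_dir = "/data/ref_shards"         # sharded reference files are written here
  tf.gfile.MakeDirs(out_dir)
  extract_references_from_wets(wet_files, metadata_dir, out_dir,
                               tmp_dir="/tmp/wet_downloads")

if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)  # make the progress logging visible
  main()

Note that Example #1 flushes each shard file after every write, while Example #2 only writes; both close all shard files at the end.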