Exemple #1
0
        def download_wet_file(path, dl_dir):
            """Download the file at `path` into `dl_dir` unless it already exists.

            The source URL is `{_DOWNLOAD_HOST}/{path}` and the destination is
            `{dl_dir}/{path}`. The file is first fetched into a uniquely named
            temporary directory next to the destination and then renamed into
            place, presumably so that an interrupted download does not leave a
            partial file at the destination path.

            Increments the "download_wet_url" counter with "exists" when the
            destination is already present, or with "downloaded" after a
            successful download.

            Args:
                path: Path of the file relative to `_DOWNLOAD_HOST`; also used
                    as the relative path under `dl_dir`.
                dl_dir: Directory under which the file is stored.

            Returns:
                The destination path, `{dl_dir}/{path}`.

            Raises:
                Any exception raised while creating the temporary directory,
                downloading, or renaming propagates to the caller after the
                temporary directory has been removed.
            """
            url = f"{_DOWNLOAD_HOST}/{path}"
            out_path = f"{dl_dir}/{path}"

            if tf.io.gfile.exists(out_path):
                c4_utils.get_counter_inc_fn("download_wet_url")("exists")
                return out_path

            # The random suffix keeps concurrent attempts for the same file
            # from sharing a temporary directory.
            tmp_dir = f"{out_path}.incomplete{uuid.uuid4().hex}"
            try:
                tf.io.gfile.makedirs(tmp_dir)
                downloader = tfds.download.downloader.get_downloader()
                with downloader.tqdm():
                    # TODO(slebedev): Investigate why pytype infers Promise[Future[...]].
                    dl_path = downloader.download(
                        url, tmp_dir).get().path  # type: ignore
                tf.io.gfile.rename(os.fspath(dl_path),
                                   out_path,
                                   overwrite=True)
            finally:
                # Always clean up the temporary directory, on success or failure.
                if tf.io.gfile.exists(tmp_dir):
                    tf.io.gfile.rmtree(tmp_dir)

            # Count only successful downloads: this line is reached only when
            # the try block finished without raising.
            c4_utils.get_counter_inc_fn("download_wet_url")("downloaded")
            return out_path
Exemple #2
0
 def _emit_examples(el):
   """Turn a `(key, features)` pair into a `(url, example)` pair.

   Increments the "examples" counter of the counter group named by `split`
   (a name taken from the enclosing scope), then returns the page URL
   together with a dict holding the five fields copied from `features`.
   """
   count = c4_utils.get_counter_inc_fn(split)
   count("examples")

   # The first element of the pair is not used.
   _unused_key, page = el
   page_url = page["url"]
   field_names = (
       "url",
       "text",
       "content-type",
       "content-length",
       "timestamp",
   )
   example = {name: page[name] for name in field_names}
   return page_url, example
Exemple #3
0
    def download_wet_file(path, dl_dir):
      """Download the file at `path` into `dl_dir` unless it already exists.

      The source URL is `{_DOWNLOAD_HOST}/{path}` and the destination is
      `{dl_dir}/{path}`. The file is first fetched into a uniquely named
      temporary directory next to the destination and then renamed into place,
      presumably so that an interrupted download does not leave a partial file
      at the destination path.

      Increments the "download_wet_url" counter with "exists" when the
      destination is already present, or with "downloaded" after a successful
      download.

      Args:
        path: Path of the file relative to `_DOWNLOAD_HOST`; also used as the
          relative path under `dl_dir`.
        dl_dir: Directory under which the file is stored.

      Returns:
        The destination path, `{dl_dir}/{path}`.

      Raises:
        Any exception raised while creating the temporary directory,
        downloading, or renaming propagates to the caller after the temporary
        directory has been removed.
      """
      url = f"{_DOWNLOAD_HOST}/{path}"
      out_path = f"{dl_dir}/{path}"

      if tf.io.gfile.exists(out_path):
        c4_utils.get_counter_inc_fn("download_wet_url")("exists")
        return out_path

      # The random suffix keeps concurrent attempts for the same file from
      # sharing a temporary directory.
      tmp_dir = f"{out_path}.incomplete{uuid.uuid4().hex}"
      try:
        tf.io.gfile.makedirs(tmp_dir)
        downloader = tfds.download.downloader.get_downloader()
        with downloader.tqdm():
          dl_path = downloader.download(url, tmp_dir).get().path
        tf.io.gfile.rename(os.fspath(dl_path), out_path, overwrite=True)
      finally:
        # Always clean up the temporary directory, on success or failure.
        if tf.io.gfile.exists(tmp_dir):
          tf.io.gfile.rmtree(tmp_dir)

      # Count only successful downloads: this line is reached only when the
      # try block finished without raising.
      c4_utils.get_counter_inc_fn("download_wet_url")("downloaded")
      return out_path
Exemple #4
0
 def _emit_examples(el):
     """Convert a `(url, text)` pair into an example dict.

     Increments the "emitted" counter in the "emit-examples" counter group,
     then returns a dict with the keys "url" and "text".
     """
     count_emitted = c4_utils.get_counter_inc_fn("emit-examples")
     count_emitted("emitted")

     page_url, page_text = el
     return dict(url=page_url, text=page_text)