Example #1
  def maybe_prepare_text(self, tmp_dir):
    """Download corpus if necessary, decompress, split into multiple text files.

    Args:
      tmp_dir: directory containing dataset.

    Returns:
      list of filepaths for local text files.
    """
    compressed_filename = os.path.basename(self.corpus_url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    decompressed_filepath = compressed_filepath[:-4]
    split_file_prefix = decompressed_filepath + "-part-"
    split_filepattern = split_file_prefix + "?????"
    split_files = sorted(tf.gfile.Glob(split_filepattern))
    if not split_files:
      if not tf.gfile.Exists(decompressed_filepath):
        if not tf.gfile.Exists(compressed_filepath):
          generator_utils.maybe_download(
              tmp_dir, compressed_filepath, self.corpus_url)
        assert not subprocess.call(["bunzip2", compressed_filepath])
      assert tf.gfile.Exists(decompressed_filepath)
      assert not subprocess.call([
          "split", "--line-bytes=4M", "--suffix-length=5",
          "--numeric-suffixes", decompressed_filepath, split_file_prefix])
      split_files = sorted(tf.gfile.Glob(split_filepattern))
    assert split_files
    return split_files
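
All of the examples on this page rely on the same contract from tensor2tensor's generator_utils.maybe_download: it returns the local path directory/filename, downloading from the URL only when that file is not already present. A minimal sketch of that behaviour (not the actual tensor2tensor implementation, which also handles things like progress reporting) might look like:

import os
import urllib.request


def maybe_download_sketch(directory, filename, uri):
  """Return directory/filename, downloading it from uri only if missing."""
  if not os.path.isdir(directory):
    os.makedirs(directory)
  filepath = os.path.join(directory, filename)
  if not os.path.exists(filepath):
    urllib.request.urlretrieve(uri, filepath)
  return filepath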
Example #2
def _get_mnist(directory):
  """Download all MNIST files to directory unless they are there."""
  for filename in [
      _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME,
      _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME
  ]:
    generator_utils.maybe_download(directory, filename, _MNIST_URL + filename)
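
The filename constants above are defined elsewhere in the same module; presumably they are the standard MNIST archive names, roughly as in this sketch (values are assumptions, not shown on this page):

_MNIST_URL = "http://yann.lecun.com/exdb/mnist/"
_MNIST_TRAIN_DATA_FILENAME = "train-images-idx3-ubyte.gz"
_MNIST_TRAIN_LABELS_FILENAME = "train-labels-idx1-ubyte.gz"
_MNIST_TEST_DATA_FILENAME = "t10k-images-idx3-ubyte.gz"
_MNIST_TEST_LABELS_FILENAME = "t10k-labels-idx1-ubyte.gz"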
Example #3
def _get_fashion_mnist(directory):
  """Download all FashionMNIST files to directory unless they are there."""
  # Fashion mnist files have the same names as MNIST.
  # We must choose a separate name (by adding 'fashion-' prefix) in the tmp_dir.
  for filename in [
      _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME,
      _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME
  ]:
    generator_utils.maybe_download(directory,
                                   _FASHION_MNIST_LOCAL_FILE_PREFIX + filename,
                                   _FASHION_MNIST_URL + filename)
Example #4
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_lines, lang2_lines = [], []
  for dataset in datasets:
    url = dataset[0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)

    lang1_filename, lang2_filename = dataset[1]
    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
    lang2_filepath = os.path.join(tmp_dir, lang2_filename)

    if not os.path.exists(compressed_filepath):
      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
      mode = "r:gz" if "gz" in compressed_filepath else "r"
      with tarfile.open(compressed_filepath, mode) as corpus_tar:
        corpus_tar.extractall(tmp_dir)
    if ".gz" in lang1_filepath:
      new_filepath = lang1_filepath.strip(".gz")
      generator_utils.gunzip_file(lang1_filepath, new_filepath)
      lang1_filepath = new_filepath
    if ".gz" in lang2_filepath:
      new_filepath = lang2_filepath.strip(".gz")
      generator_utils.gunzip_file(lang2_filepath, new_filepath)
      lang2_filepath = new_filepath
    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
        lang1_file_lines = lang1_file.readlines()
        lang2_file_lines = lang2_file.readlines()
        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
        lang1_lines.extend(lang1_file_lines)
        lang2_lines.extend(lang2_file_lines)

  write_chunk_size = 10000
  assert len(lang1_lines) == len(lang2_lines)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
    i = 0
    while i <= len(lang1_lines):
      for line in lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
        lang1_file.write(line)
      i += 1
    for line in lang1_lines[i * write_chunk_size:]:
      lang1_file.write(line)
  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
    i = 0
    while i <= len(lang2_lines):
      for line in lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
        lang2_file.write(line)
      i += 1
    for line in lang2_lines[i * write_chunk_size:]:
      lang2_file.write(line)
  return filename
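
_compile_data expects datasets to be a list of (url, (lang1_filename, lang2_filename)) tuples, as can be read off the indexing above. A hypothetical call (placeholder URL and filenames, not a real corpus) would look like:

# Placeholder dataset description; the real entries live in the translate
# problem definitions.
_EXAMPLE_DATASETS = [
    ("http://example.com/corpus-de-en.tar.gz", ("corpus.de", "corpus.en")),
]

# compiled = _compile_data(tmp_dir, _EXAMPLE_DATASETS, "translate_deen")
# would write tmp_dir/translate_deen.lang1 and tmp_dir/translate_deen.lang2
# and return the joined prefix path.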
Example #5
def _maybe_download_corpus(tmp_dir):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
  """
  corpus_url = ("http://www.statmt.org/lm-benchmark/"
                "1-billion-word-language-modeling-benchmark-r13output.tar.gz")
  corpus_filename = os.path.basename(corpus_url)
  corpus_filepath = os.path.join(tmp_dir, corpus_filename)
  if not os.path.exists(corpus_filepath):
    generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
    with tarfile.open(corpus_filepath, "r:gz") as corpus_tar:
      corpus_tar.extractall(tmp_dir)
Example #6
def load_examples(tmp_dir, prop_train=0.09, prop_val=0.01):
  """Loads exampls from the tsv file.

  Args:
    tmp_dir: temp directory.
    prop_train: proportion of the train data
    prop_val: proportion of the validation data

  Returns:
    All examples in the dataset plus train, test, and development splits.

  """

  infile = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tf.logging.info('Loading examples')

  all_examples = []
  for i, d in enumerate(csv.DictReader(gzip.open(infile), delimiter='\t')):
    if i % 100000 == 0:
      tf.logging.info('%d examples have been loaded....' % i)
    ex = {x: int(y) if y.isdigit() else y for x, y in d.items()}
    all_examples.append(ex)

  random.seed(1)
  random.shuffle(all_examples)
  n_train = int(len(all_examples) * prop_train)
  n_val = n_train + int(len(all_examples) * prop_val)
  train = all_examples[:n_train]
  val = all_examples[n_train:n_val]
  test = []
  for e in all_examples[n_val:]:
    if e['n_intervening'] == e['n_diff_intervening']:
      test.append(e)

  return all_examples, train, val, test
Example #7
def _prepare_lambada_data(tmp_dir, data_dir, vocab_size, vocab_filename):
  """Downloading and preparing the dataset.

  Args:
    tmp_dir: temp directory
    data_dir: data directory
    vocab_size: size of vocabulary
    vocab_filename: name of vocab file

  """

  if not tf.gfile.Exists(data_dir):
    tf.gfile.MakeDirs(data_dir)

  file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tar_all = tarfile.open(file_path)
  tar_all.extractall(tmp_dir)
  tar_all.close()
  tar_train = tarfile.open(os.path.join(tmp_dir, "train-novels.tar"))
  tar_train.extractall(tmp_dir)
  tar_train.close()

  vocab_path = os.path.join(data_dir, vocab_filename)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(os.path.join(tmp_dir, _VOCAB), "r") as infile:
      reader = csv.reader(infile, delimiter="\t")
      words = [row[0] for row in reader]
      words = [_UNK] + words[:vocab_size]
    with tf.gfile.GFile(vocab_path, "w") as outfile:
      outfile.write("\n".join(words))
Example #8
  def generator(self, data_dir, tmp_dir, datasets,
                eos_list=None, start_from=0, how_many=0):
    i = 0
    for url, subdir in datasets:
      filename = os.path.basename(url)
      compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

      read_type = "r:gz" if filename.endswith("tgz") else "r"
      with tarfile.open(compressed_file, read_type) as corpus_tar:
        # Create a subset of files that don't already exist.
        #   tarfile.extractall errors when encountering an existing file
        #   and tarfile.extract is extremely slow
        members = []
        for f in corpus_tar:
          if not os.path.isfile(os.path.join(tmp_dir, f.name)):
            members.append(f)
        corpus_tar.extractall(tmp_dir, members=members)

      data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
      data_files = _collect_data(data_dir, "flac", "txt")
      data_pairs = data_files.values()

      encoders = self.feature_encoders(None)
      audio_encoder = encoders["waveforms"]
      text_encoder = encoders["targets"]

      for media_file, text_data in sorted(data_pairs)[start_from:]:
        if how_many > 0 and i == how_many:
          return
        i += 1
        yield {
            "waveforms": audio_encoder.encode(media_file),
            "targets": text_encoder.encode(text_data)
        }
Example #9
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """A generator to return data samples.Returns the data generator to return.


    Args:
      data_dir: A string representing the data directory.
      tmp_dir: A string representing the temporary directory, used to
              download files if they are not already available.
      dataset_split: Train, Test or Eval.

    Yields:
      Each element yielded is of a Python dict of the form
        {"inputs": "STRING", "targets": "STRING"}
    """

    # TODO(sanyamkapoor): Manually separate train/eval data set.
    csv_file_names = self.pair_files_list
    csv_files = [
        generator_utils.maybe_download(tmp_dir, file_list[0], uri)
        for uri, file_list in csv_file_names
    ]

    for pairs_file in csv_files:
      tf.logging.debug("Reading {}".format(pairs_file))
      with open(pairs_file, "r") as csv_file:
        for line in csv_file:
          reader = csv.reader(StringIO(line))
          for docstring_tokens, function_tokens in reader:
            yield {"inputs": docstring_tokens, "targets": function_tokens}
Example #10
def _maybe_download_corpus(tmp_dir, vocab_type):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_type: which vocabulary are we using.

  Returns:
    The list of names of files.
  """
  filename = os.path.basename(PTB_URL)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, filename, PTB_URL)
  ptb_files = []
  ptb_char_files = []

  with tarfile.open(compressed_filepath, "r:gz") as tgz:
    files = []
    # Selecting only relevant files.
    for m in tgz.getmembers():
      if "ptb" in m.name and ".txt" in m.name:
        if "char" in m.name:
          ptb_char_files += [m.name]
        else:
          ptb_files += [m.name]
        files += [m]

    tgz.extractall(tmp_dir, members=files)

  if vocab_type == text_problems.VocabType.CHARACTER:
    return ptb_char_files
  else:
    return ptb_files
Example #11
def _maybe_download_corpus(tmp_dir):
  """Download corpus if necessary.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    filepath of the downloaded corpus file.
  """
  corpus_url = ("https://dumps.wikimedia.org/enwiki/20170620/"
                "enwiki-20170620-pages-articles-multistream.xml.bz2")
  corpus_filename = os.path.basename(corpus_url)
  corpus_filepath = os.path.join(tmp_dir, corpus_filename)
  if not tf.gfile.Exists(corpus_filepath):
    generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
  return corpus_filepath
Example #12
def _get_mscoco(directory):
  """Download and extract MSCOCO datasets to directory unless it is there."""
  for url in _MSCOCO_URLS:
    filename = os.path.basename(url)
    download_url = os.path.join(_MSCOCO_ROOT_URL, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    unzip_dir = os.path.join(directory, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)
Example #13
def _get_vqa_v2_image_raw_dataset(directory, image_root_url, image_urls):
  """Extract the VQA V2 image data set to directory unless it's there."""
  for url in image_urls:
    filename = os.path.basename(url)
    download_url = os.path.join(image_root_url, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    unzip_dir = os.path.join(directory, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)
Example #14
def _download_and_parse_dataset(tmp_dir, train):
  """Downloads and prepairs the dataset to be parsed by the data_generator."""
  file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL)
  zip_ref = zipfile.ZipFile(file_path, 'r')
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  file_name = 'train' if train else 'dev'
  dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name)
  _parse_dataset(dataset_file_path, tmp_dir, train)
Example #15
  def _maybe_download_corpora(self, tmp_dir):
    qnli_filename = "QNLI.zip"
    qnli_finalpath = os.path.join(tmp_dir, "QNLI")
    if not tf.gfile.Exists(qnli_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, qnli_filename, self._QNLI_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return qnli_finalpath
Example #16
  def _maybe_download_corpora(self, tmp_dir):
    scitail_filename = "SciTailV1.1.zip"
    scitail_finalpath = os.path.join(tmp_dir, "SciTailV1.1")
    if not tf.gfile.Exists(scitail_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, scitail_filename, self._SCITAIL_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return scitail_finalpath
Example #17
  def _maybe_download_corpora(self, tmp_dir):
    cola_filename = "CoLA.zip"
    cola_finalpath = os.path.join(tmp_dir, "CoLA")
    if not tf.gfile.Exists(cola_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, cola_filename, self._COLA_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return cola_finalpath
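
Examples #15 through #18 (and #43 further down) repeat the same download-and-unzip pattern and differ only in the archive name, target directory, and URL. A shared helper could look like the sketch below; it assumes the same TF 1.x tf.gfile API and tensor2tensor generator_utils used throughout these snippets and is not part of the library itself:

import os
import zipfile

import tensorflow as tf  # TF 1.x, for tf.gfile
from tensor2tensor.data_generators import generator_utils


def _maybe_download_and_unzip(tmp_dir, zip_filename, url, extracted_dirname):
  """Download zip_filename from url and unzip it unless already extracted."""
  final_path = os.path.join(tmp_dir, extracted_dirname)
  if not tf.gfile.Exists(final_path):
    zip_filepath = generator_utils.maybe_download(tmp_dir, zip_filename, url)
    with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
      zip_ref.extractall(tmp_dir)
  return final_path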
Example #18
  def _maybe_download_corpora(self, tmp_dir):
    sst_binary_filename = "SST-2.zip"
    sst_binary_finalpath = os.path.join(tmp_dir, "SST-2")
    if not tf.gfile.Exists(sst_binary_finalpath):
      zip_filepath = generator_utils.maybe_download(
          tmp_dir, sst_binary_filename, self._SST2_URL)
      zip_ref = zipfile.ZipFile(zip_filepath, "r")
      zip_ref.extractall(tmp_dir)
      zip_ref.close()

    return sst_binary_finalpath
Example #19
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file)
Example #20
def _maybe_download_corpora(tmp_dir, dataset_split):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.
    dataset_split: whether we're in train/dev/test mode.

  Returns:
    List of all files generated and path to file containing
      train/dev/test split info.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
  dailymail_filename = "dailymail_stories.tgz"
  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
  if not tf.gfile.Exists(cnn_finalpath):
    cnn_file = generator_utils.maybe_download_from_drive(
        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
      cnn_tar.extractall(tmp_dir)
  if not tf.gfile.Exists(dailymail_finalpath):
    dailymail_file = generator_utils.maybe_download_from_drive(
        tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
      dailymail_tar.extractall(tmp_dir)

  cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
  dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
  all_files = cnn_files + dailymail_files

  if dataset_split == problem.DatasetSplit.TRAIN:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_train.txt",
                                               _TRAIN_URLS)
  elif dataset_split == problem.DatasetSplit.EVAL:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_val.txt",
                                               _DEV_URLS)
  else:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt",
                                               _TEST_URLS)

  return all_files, urls_path
Example #21
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ])
Example #22
  def testMaybeDownload(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Download the Google index page to the temporary file with ".http" suffix.
    res_path = generator_utils.maybe_download(tmp_dir, tmp_file_name + ".http",
                                              "http://google.com")
    self.assertEqual(res_path, tmp_file_path + ".http")

    # Clean up.
    os.remove(tmp_file_path + ".http")
    os.remove(tmp_file_path)
Example #23
  def maybe_download_dataset(self, tmp_dir, dataset_split):
    """Downloads the appropriate dataset file and returns its path."""
    # Get the dataset url for the split requested.
    url = self.DATA_URLS.get(dataset_split, None)

    # Sanity check.
    if url is None:
      tf.logging.fatal("Unknown dataset_split passed: {}".format(dataset_split))

    # Download the data, if it doesn't already exist.
    return generator_utils.maybe_download(tmp_dir,
                                          self._extract_filename_from_url(url),
                                          url)
Example #24
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    try:
      # Download source data if download_url specified
      h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file,
                                                   self.download_url)
    except NotImplementedError:
      # Otherwise, look for it locally
      h5_filepath = os.path.join(tmp_dir, self.h5_file)

    with h5py.File(h5_filepath, "r") as h5_file:
      num_train_examples = h5_file["train_in"].len()
      num_dev_examples = h5_file["valid_in"].len()
      num_test_examples = h5_file["test_in"].len()

    # Collect all_filepaths to later shuffle
    all_filepaths = []
    # Collect created shard processes to start and join
    processes = []

    datasets = [(self.training_filepaths, self.num_shards, "train",
                 num_train_examples), (self.dev_filepaths, 10, "valid",
                                       num_dev_examples),
                (self.test_filepaths, 10, "test", num_test_examples)]
    for fname_fn, nshards, key_prefix, num_examples in datasets:
      outfiles = fname_fn(data_dir, nshards, shuffled=False)
      all_filepaths.extend(outfiles)
      for start_idx, end_idx, outfile in generate_shard_args(
          outfiles, num_examples):
        p = mp.Process(
            target=generate_dataset,
            args=(h5_filepath, key_prefix, [outfile], self.chunk_size,
                  start_idx, end_idx))
        processes.append(p)

    # 1 per training shard + 10 for dev + 10 for test
    assert len(processes) == self.num_shards + 20

    # Start and wait for processes in batches
    num_batches = int(
        math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
    for i in range(num_batches):
      start = i * MAX_CONCURRENT_PROCESSES
      end = start + MAX_CONCURRENT_PROCESSES
      current = processes[start:end]
      for p in current:
        p.start()
      for p in current:
        p.join()

    # Shuffle
    generator_utils.shuffle_dataset(all_filepaths)
Example #25
    def testMaybeDownload(self):
        tmp_dir = self.get_temp_dir()
        (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
        tmp_file_name = os.path.basename(tmp_file_path)

        # Download the Google index page to the temporary file with ".http" suffix.
        res_path = generator_utils.maybe_download(tmp_dir,
                                                  tmp_file_name + ".http",
                                                  "http://google.com")
        self.assertEqual(res_path, tmp_file_path + ".http")

        # Clean up.
        os.remove(tmp_file_path + ".http")
        os.remove(tmp_file_path)
Example #26
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        try:
            # Download source data if download_url specified
            h5_filepath = generator_utils.maybe_download(
                tmp_dir, self.h5_file, self.download_url)
        except NotImplementedError:
            # Otherwise, look for it locally
            h5_filepath = os.path.join(tmp_dir, self.h5_file)

        with h5py.File(h5_filepath, "r") as h5_file:
            num_train_examples = h5_file["train_in"].len()
            num_dev_examples = h5_file["valid_in"].len()
            num_test_examples = h5_file["test_in"].len()

        # Collect all_filepaths to later shuffle
        all_filepaths = []
        # Collect created shard processes to start and join
        processes = []

        datasets = [(self.training_filepaths, self.num_shards, "train",
                     num_train_examples),
                    (self.dev_filepaths, 10, "valid", num_dev_examples),
                    (self.test_filepaths, 10, "test", num_test_examples)]
        for fname_fn, nshards, key_prefix, num_examples in datasets:
            outfiles = fname_fn(data_dir, nshards, shuffled=False)
            all_filepaths.extend(outfiles)
            for start_idx, end_idx, outfile in generate_shard_args(
                    outfiles, num_examples):
                p = mp.Process(target=generate_dataset,
                               args=(h5_filepath, key_prefix, [outfile],
                                     self.chunk_size, start_idx, end_idx))
                processes.append(p)

        # 1 per training shard + 10 for dev + 10 for test
        assert len(processes) == self.num_shards + 20

        # Start and wait for processes in batches
        num_batches = int(
            math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
        for i in range(num_batches):
            start = i * MAX_CONCURRENT_PROCESSES
            end = start + MAX_CONCURRENT_PROCESSES
            current = processes[start:end]
            for p in current:
                p.start()
            for p in current:
                p.join()

        # Shuffle
        generator_utils.shuffle_dataset(all_filepaths)
Example #27
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Downloads and extracts the dataset and generates examples.

    Args:
      data_dir: The base directory where data and vocab files are stored.
      tmp_dir: temp directory to download and extract the dataset.
      dataset_split: split of the data-set.

    Yields:
      The data examples.
    """
        # Create directories if needed.
        if not tf.gfile.Exists(tmp_dir):
            tf.gfile.MakeDirs(tmp_dir)
        if not tf.gfile.Exists(data_dir):
            tf.gfile.MakeDirs(data_dir)

        # Download and extract the data.
        filename = os.path.basename(_URL)
        path = generator_utils.maybe_download(tmp_dir, filename, _URL)
        tarfile.open(path, "r:gz").extractall(tmp_dir)

        # Create the list of directories with data files.
        train_dirs = [
            "v1.0/train-easy", "v1.0/train-medium", "v1.0/train-hard"
        ]
        eval_dirs = ["v1.0/interpolate", "v1.0/extrapolate"]
        dirs = eval_dirs
        if dataset_split == problem.DatasetSplit.TRAIN:
            dirs = train_dirs
        dirs = [os.path.join(tmp_dir, d) for d in dirs]

        # Iterate over directories and files generating examples.
        for d in dirs:
            files = tf.gfile.Glob(d + "/*.txt")
            for fname in files:
                # In each text file, the first line is the input, the next the answer,
                # and so on until the end of the file.
                cur_input = None
                with tf.gfile.Open(fname, "rb") as f:
                    for line in f:
                        if cur_input is None:
                            cur_input = line.strip()
                        else:
                            yield {
                                "inputs": cur_input,
                                "targets": line.strip()
                            }
                            cur_input = None
Example #28
def _get_vqa_v2_dataset(directory):
    """Extract the VQA V2 data set to directory unless it's there."""
    for url in _MSCOCO_IMAGE_URLS:
        filename = os.path.basename(url)
        download_url = os.path.join(_MSCOCO_ROOT_URL, url)
        path = generator_utils.maybe_download(directory, filename,
                                              download_url)
        unzip_dir = os.path.join(directory, filename[:-len(".zip")])
        if not tf.gfile.Exists(unzip_dir):
            zipfile.ZipFile(path, "r").extractall(directory)

    annotation_file = generator_utils.maybe_download_from_drive(
        directory, "vqa_v2.tar.gz", _VQA_V2_ANNOTATION_URL)
    with tarfile.open(annotation_file, "r:gz") as annotation_tar:
        annotation_tar.extractall(directory)
Example #29
    def maybe_download_and_unzip(self, tmp_dir):
        """Downloads deepsea data if it doesn"t already exist.

    Args:
      tmp_dir: String. The directory to maybe download to.

    Returns:
      String. The directory path where the unprocessed data was downloaded and
        unzipped.
    """
        url = ("http://deepsea.princeton.edu/media/code/"
               "deepsea_train_bundle.v0.9.tar.gz")
        filename = "deepsea_train_bundle.v0.9.tar.gz"
        generator_utils.maybe_download(tmp_dir, filename, url)
        dirpath = os.path.join(tmp_dir, "deepsea_train")
        if not os.path.exists(dirpath):
            tf.logging.info(
                f"Extracting archive {filename} to directory: {dirpath}")
            filepath = os.path.join(tmp_dir, filename)
            tarfile.open(filepath, "r:gz").extractall(tmp_dir)
        else:
            tf.logging.info(
                f"Not extracting archive, directory already found: {dirpath}")
        return dirpath
Example #30
    def _maybe_download(self, tmp_dir, dataset_split):
        filename = os.path.basename(_MS_COCO_ZIPPED_FILE)
        download_url = os.path.join(_MS_COCO_DOWNLOAD_URL, filename)
        path = generator_utils.maybe_download(tmp_dir, filename, download_url)
        unzip_dir = os.path.join(tmp_dir, filename[:-len(".zip")])
        if not tf.gfile.Exists(unzip_dir):
            tf.logging.info("Unzipping data to {}".format(unzip_dir))
            zipfile.ZipFile(path, "r").extractall(unzip_dir)

        if dataset_split == problem.DatasetSplit.TRAIN:
            ms_coco_file = _MS_COCO_TRAIN_FILE
        else:
            ms_coco_file = _MS_COCO_DEV_FILE
        ms_coco_path = os.path.join(unzip_dir, "annotations", ms_coco_file)
        return ms_coco_path
Example #31
  def _maybe_download(self, tmp_dir, dataset_split):
    filename = os.path.basename(_MS_COCO_ZIPPED_FILE)
    download_url = os.path.join(_MS_COCO_DOWNLOAD_URL, filename)
    path = generator_utils.maybe_download(tmp_dir, filename, download_url)
    unzip_dir = os.path.join(tmp_dir, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      tf.logging.info("Unzipping data to {}".format(unzip_dir))
      zipfile.ZipFile(path, "r").extractall(unzip_dir)

    if dataset_split == problem.DatasetSplit.TRAIN:
      ms_coco_file = _MS_COCO_TRAIN_FILE
    else:
      ms_coco_file = _MS_COCO_DEV_FILE
    ms_coco_path = os.path.join(unzip_dir, "annotations", ms_coco_file)
    return ms_coco_path
Example #32
    def generator(self,
                  data_dir,
                  tmp_dir,
                  datasets,
                  eos_list=None,
                  start_from=0,
                  how_many=0):
        del eos_list
        i = 0

        filename = os.path.basename(_COMMONVOICE_URL)
        compressed_file = generator_utils.maybe_download(
            tmp_dir, filename, _COMMONVOICE_URL)

        read_type = "r:gz" if filename.endswith(".tgz") else "r"
        with tarfile.open(compressed_file, read_type) as corpus_tar:
            # Create a subset of files that don't already exist.
            #   tarfile.extractall errors when encountering an existing file
            #   and tarfile.extract is extremely slow. For security, check that all
            #   paths are relative.
            members = [
                f for f in corpus_tar if _is_relative(tmp_dir, f.name)
                and not _file_exists(tmp_dir, f.name)
            ]
            corpus_tar.extractall(tmp_dir, members=members)

        data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
        data_tuples = _collect_data(data_dir)
        encoders = self.feature_encoders(None)
        audio_encoder = encoders["waveforms"]
        text_encoder = encoders["targets"]
        for dataset in datasets:
            data_tuples = (tup for tup in data_tuples
                           if tup[0].startswith(dataset))
            for utt_id, media_file, text_data in tqdm.tqdm(
                    sorted(data_tuples)[start_from:]):
                if how_many > 0 and i == how_many:
                    return
                i += 1
                wav_data = audio_encoder.encode(media_file)
                yield {
                    "waveforms": wav_data,
                    "waveform_lens": [len(wav_data)],
                    "targets": text_encoder.encode(text_data),
                    "raw_transcript": [text_data],
                    "utt_id": [utt_id],
                    "spk_id": ["unknown"],
                }
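
_is_relative and _file_exists are not shown on this page. Judging from how they are used above (skip members that already exist, and only extract members that stay inside tmp_dir), they presumably look roughly like this sketch:

import os


def _file_exists(path, filename):
  """Whether the archive member has already been extracted under path."""
  return os.path.isfile(os.path.join(path, filename))


def _is_relative(path, filename):
  """Whether the archive member resolves to a location inside path."""
  return os.path.abspath(os.path.join(path, filename)).startswith(
      os.path.abspath(path))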
Example #33
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        url = self._URL
        file_name = self._DEV_SET
        if dataset_split == problem.DatasetSplit.TRAIN:
            file_name = self._TRAINING_SET
        squad_file = generator_utils.maybe_download(
            tmp_dir, file_name, os.path.join(url, file_name))
        with tf.gfile.GFile(squad_file, mode="r") as fp:
            squad = json.load(fp)

        version = squad["version"]
        for article in squad["data"]:
            if "title" in article:
                title = article["title"].strip()
            else:
                title = "no title"
            for paragraph in article["paragraphs"]:
                context = paragraph["context"].strip()
                for qa in paragraph["qas"]:
                    question = qa["question"].strip()
                    id_ = qa["id"]

                    answer_starts = [
                        answer["answer_start"] for answer in qa["answers"]
                    ]
                    answers = [
                        answer["text"].strip() for answer in qa["answers"]
                    ]

                    # Features currently used are "context", "question", and "answers".
                    # Others are extracted here for the ease of future expansions.
                    example = {
                        "version": version,
                        "title": title,
                        "context": context,
                        "question": question,
                        "id": id_,
                        "answer_starts": answer_starts,
                        "answers": answers,
                        "num_answers": len(answers),
                        "is_supervised": True,
                    }
                    yield {
                        "inputs": example["question"],
                        # TODO(ddohan, wgaj): Figure out a way of extracting all answers.
                        "targets": example["answers"][0],
                        "context": example["context"]
                    }
Example #34
def _generate_examples(tmp_dir, dataset_split):
    """Generate squad examples.

  Args:
    tmp_dir: a string
    dataset_split: problem.DatasetSplit.TRAIN or problem.DatasetSplit.EVAL
  Yields:
    dictionaries representing examples
  """
    if dataset_split == problem.DatasetSplit.TRAIN:
        file_name = _TRAINING_SET
    else:
        file_name = _DEV_SET
    squad_file = generator_utils.maybe_download(tmp_dir, file_name,
                                                os.path.join(_URL, file_name))
    with tf.gfile.GFile(squad_file, mode="r") as fp:
        squad = json.load(fp)

    version = squad["version"]
    for article in squad["data"]:
        if "title" in article:
            title = article["title"].strip()
        else:
            title = "no title"
        for paragraph in article["paragraphs"]:
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                question = qa["question"].strip()
                id_ = qa["id"]
                answer_starts = [
                    answer["answer_start"] for answer in qa["answers"]
                ]
                answers = [answer["text"].strip() for answer in qa["answers"]]

                # Features currently used are "context", "question", and "answers".
                # Others are extracted here for the ease of future expansions.
                example = {
                    "version": version,
                    "title": title,
                    "context": context,
                    "question": question,
                    "id": id_,
                    "answer_starts": answer_starts,
                    "answers": answers,
                    "num_answers": len(answers),
                    "is_supervised": True,
                }
                yield example
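
The parser above assumes the usual SQuAD JSON layout. The field names in the sketch below are taken directly from the code; the values are made-up placeholders:

_SQUAD_JSON_SHAPE = {
    "version": "1.1",
    "data": [{
        "title": "Some article title",
        "paragraphs": [{
            "context": "Some paragraph of text ...",
            "qas": [{
                "id": "some-unique-id",
                "question": "A question about the context?",
                "answers": [{"answer_start": 0, "text": "Some"}],
            }],
        }],
    }],
}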
Example #35
    def generator(self,
                  data_dir,
                  tmp_dir,
                  datasets,
                  eos_list=None,
                  start_from=0,
                  how_many=0):
        del eos_list
        i = 0
        for url, subdir in datasets:
            filename = os.path.basename(url)
            compressed_file = generator_utils.maybe_download(
                tmp_dir, filename, url)

            read_type = "r:gz" if filename.endswith("tgz") else "r"
            with tarfile.open(compressed_file, read_type) as corpus_tar:
                # Create a subset of files that don't already exist.
                #   tarfile.extractall errors when encountering an existing file
                #   and tarfile.extract is extremely slow
                members = []
                for f in corpus_tar:
                    if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                        members.append(f)
                corpus_tar.extractall(tmp_dir, members=members)

            raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
            data_files = _collect_data(raw_data_dir, "flac", "txt")
            data_pairs = data_files.values()

            encoders = self.feature_encoders(data_dir)
            audio_encoder = encoders["waveforms"]
            text_encoder = encoders["targets"]

            for utt_id, media_file, text_data in sorted(
                    data_pairs)[start_from:]:
                if how_many > 0 and i == how_many:
                    return
                i += 1
                wav_data = audio_encoder.encode(media_file)
                spk_id, unused_book_id, _ = utt_id.split("-")
                yield {
                    "waveforms": wav_data,
                    "waveform_lens": [len(wav_data)],
                    "targets": text_encoder.encode(text_data),
                    "raw_transcript": [text_data],
                    "utt_id": [utt_id],
                    "spk_id": [spk_id],
                }
Example #36
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    if dataset_split == problem.DatasetSplit.TRAIN:
      urls = self.get_urls(DATA_TRAIN[0], DATA_TRAIN[1])
    else:
      urls = self.get_urls(DATA_TEST_SEEN[0], DATA_TEST_SEEN[1])
      urls += self.get_urls(DATA_TEST_NOVEL[0], DATA_TEST_NOVEL[1])

    for url in urls:
      path = generator_utils.maybe_download(tmp_dir, os.path.basename(url), url)
      for frame_number, frame, state, action in self.parse_frames(path):
        yield {
            "frame_number": [frame_number],
            "frame": frame,
            "state": state,
            "action": action,
        }
Example #38
    def generator(self,
                  data_dir,
                  tmp_dir,
                  training,
                  eos_list=None,
                  start_from=0,
                  how_many=0):
        eos_list = [1] if eos_list is None else eos_list
        datasets = (_LIBRISPEECH_TRAIN_DATASETS
                    if training else _LIBRISPEECH_TEST_DATASETS)
        num_reserved_ids = self.feature_encoders(
            None)["targets"].num_reserved_ids
        i = 0
        for url, subdir in datasets:
            filename = os.path.basename(url)
            compressed_file = generator_utils.maybe_download(
                tmp_dir, filename, url)

            read_type = "r:gz" if filename.endswith("tgz") else "r"
            with tarfile.open(compressed_file, read_type) as corpus_tar:
                # Create a subset of files that don't already exist.
                #   tarfile.extractall errors when encountering an existing file
                #   and tarfile.extract is extremely slow
                members = []
                for f in corpus_tar:
                    if not os.path.isfile(os.path.join(tmp_dir, f.name)):
                        members.append(f)
                corpus_tar.extractall(tmp_dir, members=members)

            data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
            data_files = _collect_data(data_dir, "flac", "txt")
            data_pairs = data_files.values()
            for media_file, text_data in sorted(data_pairs)[start_from:]:
                if how_many > 0 and i == how_many:
                    return
                i += 1
                audio_data, sample_count, sample_width, num_channels = _get_audio_data(
                    media_file)
                label = [num_reserved_ids + ord(c)
                         for c in text_data] + eos_list
                yield {
                    "inputs": audio_data,
                    "audio/channel_count": [num_channels],
                    "audio/sample_count": [sample_count],
                    "audio/sample_width": [sample_width],
                    "targets": label
                }
Example #39
  def generator(self,
                data_dir,
                tmp_dir,
                datasets,
                eos_list=None,
                start_from=0,
                how_many=0):
    del eos_list
    i = 0

    filename = os.path.basename(_COMMONVOICE_URL)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename,
                                                     _COMMONVOICE_URL)

    read_type = "r:gz" if filename.endswith(".tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      # Create a subset of files that don't already exist.
      #   tarfile.extractall errors when encountering an existing file
      #   and tarfile.extract is extremely slow. For security, check that all
      #   paths are relative.
      members = [
          f for f in corpus_tar if _is_relative(tmp_dir, f.name) and
          not _file_exists(tmp_dir, f.name)
      ]
      corpus_tar.extractall(tmp_dir, members=members)

    raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1")
    data_tuples = _collect_data(raw_data_dir)
    encoders = self.feature_encoders(data_dir)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]
    for dataset in datasets:
      data_tuples = (tup for tup in data_tuples if tup[0].startswith(dataset))
      for utt_id, media_file, text_data in tqdm.tqdm(
          sorted(data_tuples)[start_from:]):
        if how_many > 0 and i == how_many:
          return
        i += 1
        wav_data = audio_encoder.encode(media_file)
        yield {
            "waveforms": wav_data,
            "waveform_lens": [len(wav_data)],
            "targets": text_encoder.encode(text_data),
            "raw_transcript": [text_data],
            "utt_id": [utt_id],
            "spk_id": ["unknown"],
        }
Example #40
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        url = self._URL
        file_name = (self._TRAINING_SET if dataset_split
                     == problem.DatasetSplit.TRAIN else self._DEV_SET)
        squad_file = generator_utils.maybe_download(
            tmp_dir, file_name, os.path.join(url, file_name))
        with tf.gfile.GFile(squad_file, mode='r') as fp:
            squad = json.load(fp)

        version = squad['version']
        for article in squad['data']:
            if 'title' in article:
                title = article['title'].strip()
            else:
                title = 'no title'
            for paragraph in article['paragraphs']:
                context = paragraph['context'].strip()
                for qa in paragraph['qas']:
                    question = qa['question'].strip()
                    id_ = qa['id']

                    answer_starts = [
                        answer['answer_start'] for answer in qa['answers']
                    ]
                    answers = [
                        answer['text'].strip() for answer in qa['answers']
                    ]

                    # Features currently used are 'context', 'question', and 'answers'.
                    # Others are extracted here for the ease of future expansions.
                    example = {
                        'version': version,
                        'title': title,
                        'context': context,
                        'question': question,
                        'id': id_,
                        'answer_starts': answer_starts,
                        'answers': answers,
                        'num_answers': len(answers),
                        'is_supervised': True,
                    }
                    yield {
                        'inputs': example['question'],
                        # TODO(ddohan, wgaj): Figure out a way of extracting all answers.
                        'targets': example['answers'][0],
                        'context': example['context']
                    }
Example #41
    def generate_samples(self, data_dir, tmp_dir, dataset_split):  # pylint: disable=no-self-use,unused-argument
        """Returns a generator to return {"inputs": [text], "targets": [text]}."""

        pair_csv_files = [
            generator_utils.maybe_download(data_dir, filename, uri)
            for uri, filename in self.source_data_files(dataset_split)
        ]

        for pairs_file in pair_csv_files:
            with open(pairs_file, 'r') as csv_file:
                pairs_reader = csv.reader(csv_file)
                for row in pairs_reader:
                    function_tokens, docstring_tokens = row[-2:]
                    yield {
                        'inputs': docstring_tokens,
                        'targets': function_tokens
                    }
Example #42
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Generate examples."""
        # Download and extract
        compressed_filename = os.path.basename(self.URL)
        download_path = generator_utils.maybe_download(tmp_dir,
                                                       compressed_filename,
                                                       self.URL)
        imdb_dir = os.path.join(tmp_dir, "aclImdb")
        if not tf.gfile.Exists(imdb_dir):
            with tarfile.open(download_path, "r:gz") as tar:
                tar.extractall(tmp_dir)

        # Generate examples
        train = dataset_split == problem.DatasetSplit.TRAIN
        dataset = "train" if train else "test"
        # punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        punctuation = '!#$%&\()*+-/:;.<=>?@[\\]^_`{|}'
        for doc, label in self.doc_generator(imdb_dir,
                                             dataset,
                                             include_label=True):
            sentences = sent_tokenize(doc)
            # if len(sentences) > 3:
            #     sentence = sentences[:3]
            #
            # else:
            #     sentence = sentences
            #
            # sentence = " ".join(sentence)
            sentence = sentences[:]
            sentence = ' '.join(sentence)

            tokens = word_tokenize(sentence)
            table = str.maketrans('', '', punctuation)
            stripped = [w.translate(table) for w in tokens]
            tokens = [word.lower() for word in stripped if word != 'br']
            # 512 is the max len for the bert models
            if len(tokens) < self.MAX_LEN_SENT:
                # tokens = [word for word in stripped]
                doc = ' '.join(tokens)
                doc = re.sub('\'\'', '', doc).strip()
                doc = re.sub(r'\s+', ' ', doc).strip()
                yield {
                    "inputs": doc,
                    "label": int(label),
                }
Example #43
def _maybe_download_corpora(tmp_dir):
  """Download corpora for multinli.

  Args:
    tmp_dir: a string
  Returns:
    a string
  """
  mnli_filename = "MNLI.zip"
  mnli_finalpath = os.path.join(tmp_dir, "MNLI")
  if not tf.gfile.Exists(mnli_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, mnli_filename, _MNLI_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

  return mnli_finalpath
Example #44
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        compressed_filename = os.path.basename(self.URL)
        download_path = generator_utils.maybe_download(tmp_dir,
                                                       compressed_filename,
                                                       self.URL)

        dir = os.path.join(tmp_dir, 'mtl-dataset')
        if not tf.gfile.Exists(dir):
            with tarfile.open(download_path, "r:gz") as tar:
                tar.extractall(tmp_dir)

        train = dataset_split == problem.DatasetSplit.TRAIN
        dataset = "train" if train else "test"
        for doc, label in self.doc_generator(dir, dataset, include_label=True):
            yield {
                "inputs": doc,
                "label": int(label),
            }
Example #45
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    url = self._URL
    file_name = (self._TRAINING_SET if dataset_split ==
                 problem.DatasetSplit.TRAIN else self._DEV_SET)
    squad_file = generator_utils.maybe_download(tmp_dir,
                                                file_name,
                                                os.path.join(url, file_name))
    with tf.gfile.GFile(squad_file, mode='r') as fp:
      squad = json.load(fp)

    version = squad['version']
    for article in squad['data']:
      if 'title' in article:
        title = article['title'].strip()
      else:
        title = 'no title'
      for paragraph in article['paragraphs']:
        context = paragraph['context'].strip()
        for qa in paragraph['qas']:
          question = qa['question'].strip()
          id_ = qa['id']

          answer_starts = [answer['answer_start'] for answer in qa['answers']]
          answers = [answer['text'].strip() for answer in qa['answers']]

          # Features currently used are 'context', 'question', and 'answers'.
          # Others are extracted here for the ease of future expansions.
          example = {
              'version': version,
              'title': title,
              'context': context,
              'question': question,
              'id': id_,
              'answer_starts': answer_starts,
              'answers': answers,
              'num_answers': len(answers),
              'is_supervised': True,
          }
          yield {
              'inputs': example['question'],
              # TODO(ddohan, wgaj): Figure out a way of extracting all answers.
              'targets': example['answers'][0],
              'context': example['context']
          }
Example #46
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
               "street/python/fsns_urls.txt")
   fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                              list_url)
   fsns_files = [
       f.strip() for f in open(fsns_urls, "r") if f.startswith("http://")
   ]
   for url in fsns_files:
     if "/train/train" in url:
       generator_utils.maybe_download(
           data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
     elif "/validation/validation" in url:
       generator_utils.maybe_download(
           data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
     elif "charset" in url:
       generator_utils.maybe_download(data_dir, "charset_size134.txt", url)
Example #47
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/"
               "street/python/fsns_urls.txt")
   fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt",
                                              list_url)
   fsns_files = [
       f.strip() for f in open(fsns_urls, "r") if f.startswith("http://")
   ]
   for url in fsns_files:
     if "/train/train" in url:
       generator_utils.maybe_download(
           data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url)
     elif "/validation/validation" in url:
       generator_utils.maybe_download(
           data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url)
     elif "charset" in url:
       generator_utils.maybe_download(data_dir, "charset_size134.txt", url)
Example #48
def _maybe_download_corpus(tmp_dir, vocab_type):
    """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_type: which vocabulary are we using.

  Returns:
    The list of names of files.
  """
    if vocab_type == text_problems.VocabType.CHARACTER:

        dataset_url = ("https://s3.amazonaws.com/research.metamind.io/wikitext"
                       "/wikitext-103-raw-v1.zip")
        dir_name = "wikitext-103-raw"
    else:
        dataset_url = ("https://s3.amazonaws.com/research.metamind.io/wikitext"
                       "/wikitext-103-v1.zip")
        dir_name = "wikitext-103"

    fname = os.path.basename(dataset_url)
    compressed_filepath = generator_utils.maybe_download(
        tmp_dir, fname, dataset_url)
    zip_ref = zipfile.ZipFile(compressed_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

    files = os.path.join(tmp_dir, dir_name, "*")
    train_file, valid_file, test_file = None, None, None
    for f in tf.gfile.Glob(files):
        fname = os.path.basename(f)
        if "train" in fname:
            train_file = f
        elif "valid" in fname:
            valid_file = f
        elif "test" in fname:
            test_file = f

    assert train_file, "Training file not found"
    assert valid_file, "Validation file not found"
    assert test_file, "Testing file not found"

    return train_file, valid_file, test_file
Example #49
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        filename = os.path.basename(PTB_URL)
        compressed_filepath = generator_utils.maybe_download(
            tmp_dir, filename, PTB_URL)
        ptb_files = []
        ptb_char_files = []
        with tarfile.open(compressed_filepath, "r:gz") as tgz:
            files = []
            # Selecting only relevant files.
            for m in tgz.getmembers():
                if "ptb" in m.name and ".txt" in m.name:
                    if "char" in m.name:
                        ptb_char_files += [m.name]
                    else:
                        ptb_files += [m.name]
                    files += [m]

            tgz.extractall(tmp_dir, members=files)

        if self.vocab_type == text_problems.VocabType.CHARACTER:
            files = ptb_char_files
        else:
            files = ptb_files

        train_file, valid_file = None, None
        for filename in files:
            if "train" in filename:
                train_file = os.path.join(tmp_dir, filename)
            elif "valid" in filename:
                valid_file = os.path.join(tmp_dir, filename)

        assert train_file, "Training file not found"
        assert valid_file, "Validation file not found"

        _get_token_encoder(data_dir, self.vocab_filename, train_file)

        train = dataset_split == problem.DatasetSplit.TRAIN
        filepath = train_file if train else valid_file

        with tf.gfile.GFile(filepath, "r") as f:
            for line in f:
                line = " ".join(line.replace("\n", " %s " % EOS).split())
                yield {"targets": line}
Example #50
def _maybe_download_corpus(tmp_dir):
    """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    path to entire corpus as a text file.
  """
    corpus_url = "http://mattmahoney.net/dc/enwik8.zip"
    corpus_filename = os.path.basename(corpus_url)
    compressed_filepath = generator_utils.maybe_download(
        tmp_dir, corpus_filename, corpus_url)

    zip_ref = zipfile.ZipFile(compressed_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()

    return os.path.join(tmp_dir, "enwik8")
Example #51
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate examples."""
    # Download and extract
    compressed_filename = os.path.basename(self.URL)
    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                   self.URL)
    imdb_dir = os.path.join(tmp_dir, "aclImdb")
    if not tf.gfile.Exists(imdb_dir):
      with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(tmp_dir)

    # Generate examples
    train = dataset_split == problem.DatasetSplit.TRAIN
    dataset = "train" if train else "test"
    for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
      yield {
          "inputs": doc,
          "label": int(label),
      }
Example #52
    def generator(self, data_dir, tmp_dir, train):
        filename = os.path.basename(PTB_URL)
        compressed_filepath = generator_utils.maybe_download(
            tmp_dir, filename, PTB_URL)
        ptb_files = []
        ptb_char_files = []
        with tarfile.open(compressed_filepath, "r:gz") as tgz:
            files = []
            # Selecting only relevant files.
            for m in tgz.getmembers():
                if "ptb" in m.name and ".txt" in m.name:
                    if "char" in m.name:
                        ptb_char_files += [m.name]
                    else:
                        ptb_files += [m.name]
                    files += [m]

            tgz.extractall(tmp_dir, members=files)

        if self.is_character_level:
            files = ptb_char_files
        else:
            files = ptb_files

        train_file, valid_file = None, None
        for filename in files:
            if "train" in filename:
                train_file = os.path.join(tmp_dir, filename)
            elif "valid" in filename:
                valid_file = os.path.join(tmp_dir, filename)

        assert train_file, "Training file not found"
        assert valid_file, "Validation file not found"

        if self.is_character_level:
            encoder = text_encoder.ByteTextEncoder()
        else:
            encoder = _get_token_encoder(data_dir, self.vocab_file, train_file)

        if train:
            return self._generator(train_file, encoder)
        return self._generator(valid_file, encoder)
Example #53
0
def load_examples(tmp_dir, equalize_classes=False):
  """Loads exampls from the tsv file.

  Args:
    tmp_dir: temp directory.
    equalize_classes: if equalize number of examples in the classes.

  Returns:
    All examples in the dataset.

  """

  infile = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tf.logging.info('Loading examples')

  all_examples = []
  # Read in text mode so csv.DictReader receives strings rather than bytes.
  for i, d in enumerate(csv.DictReader(gzip.open(infile, 'rt'), delimiter='\t')):
    if i % 100000 == 0:
      tf.logging.info('%d examples have been loaded....' % i)
    ex = {x: int(y) if y.isdigit() else y for x, y in d.items()}
    all_examples.append(ex)

  classes = defaultdict(list)
  for ex in all_examples:
    classes[ex['verb_pos']].append(ex)

  del all_examples[:]
  assert len(classes) == 2

  # dict.values() is not indexable in Python 3; materialize it first.
  c1, c2 = list(classes.values())
  random.seed(1)
  random.shuffle(c1)
  random.shuffle(c2)
  if equalize_classes:
    l = min(len(c1), len(c2))
    all_examples = c1[:l] + c2[:l]
  else:
    all_examples = c1 + c2
  random.shuffle(all_examples)

  return all_examples
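A short usage sketch, not from the original page: assuming _TAR and _URL point at the gzipped TSV described above, load_examples can be driven like this (the temp directory is an assumption).

# Hypothetical call site for load_examples; the path is a placeholder.
examples = load_examples('/tmp/verb_classification', equalize_classes=True)
tf.logging.info('Loaded %d class-balanced examples', len(examples))
for ex in examples[:3]:
  tf.logging.info('verb_pos=%s keys=%s', ex['verb_pos'], sorted(ex.keys()))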
Example #54
0
def _prepare_babi_data(tmp_dir, data_dir):
    """Downloads and extracts the dataset.

  Args:
    tmp_dir: temp directory to download and extract the dataset
    data_dir: The base directory where data and vocab files are stored.

  Returns:
    tmp_dir: temp directory containing the raw data.
  """
    if not tf.gfile.Exists(data_dir):
        tf.gfile.MakeDirs(data_dir)

    # TODO(dehghani@): find a solution for blocking user-agent (download)
    file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
    tar = tarfile.open(file_path)
    tar.extractall(tmp_dir)
    tar.close()

    return tmp_dir
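A minimal sketch of how _prepare_babi_data might be invoked; both directory paths are illustrative assumptions rather than values from the source.

# Hypothetical call; _TAR and _URL are assumed to be defined as in the snippet.
raw_dir = _prepare_babi_data(tmp_dir="/tmp/babi_tmp", data_dir="/tmp/babi_data")
tf.logging.info("bAbI raw data extracted under %s", raw_dir)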
Example #55
0
    def generate_data(self, data_dir, tmp_dir, num_shards=None):
        if num_shards is None:
            num_shards = 100

        # Download source data
        h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file,
                                                     self.download_url)
        with h5py.File(h5_filepath, "r") as h5_file:
            num_train_examples = h5_file["train_in"].len()
            num_dev_examples = h5_file["valid_in"].len()
            num_test_examples = h5_file["test_in"].len()

        # Collect all_filepaths to later shuffle
        all_filepaths = []
        # Collect created shard processes to start and join
        processes = []

        datasets = [(self.training_filepaths, num_shards, "train",
                     num_train_examples),
                    (self.dev_filepaths, 1, "valid", num_dev_examples),
                    (self.test_filepaths, 1, "test", num_test_examples)]
        for fname_fn, nshards, key_prefix, num_examples in datasets:
            outfiles = fname_fn(data_dir, nshards, shuffled=False)
            all_filepaths.extend(outfiles)
            for start_idx, end_idx, outfile in generate_shard_args(
                    outfiles, num_examples):
                p = mp.Process(target=generate_dataset,
                               args=(h5_filepath, key_prefix, [outfile],
                                     start_idx, end_idx))
                processes.append(p)

        # Start and wait for processes
        assert len(
            processes) == num_shards + 2  # 1 per training shard + dev + test
        for p in processes:
            p.start()
        for p in processes:
            p.join()

        # Shuffle
        generator_utils.shuffle_dataset(all_filepaths)
Example #56
0
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    path = generator_utils.maybe_download(
        tmp_dir, os.path.basename(DATA_URL), DATA_URL)

    tar = tarfile.open(path)
    tar.extractall(tmp_dir)
    tar.close()

    if dataset_split == problem.DatasetSplit.TRAIN:
      base_dir = os.path.join(tmp_dir, "softmotion30_44k/train/*")
    else:
      base_dir = os.path.join(tmp_dir, "softmotion30_44k/test/*")

    filenames = tf.gfile.Glob(base_dir)
    for frame_number, frame, state, action in self.parse_frames(filenames):
      yield {
          "frame_number": [frame_number],
          "frame": frame,
          "state": state,
          "action": action,
      }
Example #57
0
def download_and_extract_data(tmp_dir, dataset):
    """Download and extract files."""
    url = dataset[0]
    tf.logging.info("Downloading and extracting dataset: %s", str(dataset))
    compressed_filename = os.path.basename(url)
    compressed_file = generator_utils.maybe_download(tmp_dir,
                                                     compressed_filename, url)

    for filename in dataset[1]:
        tf.logging.info("Reading file: %s" % filename)
        filepath = os.path.join(tmp_dir, filename)

        # Extract from tar if needed.
        if not tf.gfile.Exists(filepath):
            with tarfile.open(compressed_file, "r:gz") as corpus_tar:
                corpus_tar.extractall(tmp_dir)

    documents_filename, labels_filename = dataset[1]
    documents_filepath = os.path.join(tmp_dir, documents_filename)
    labels_filepath = os.path.join(tmp_dir, labels_filename)
    return documents_filepath, labels_filepath
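A short sketch of how download_and_extract_data might be called; the URL and member names in the dataset tuple are placeholders, not values from the original code.

# Hypothetical dataset spec: (archive URL, (documents file, labels file)).
dataset = ("https://example.com/corpus.tar.gz",
           ("corpus/documents.txt", "corpus/labels.txt"))
docs_path, labels_path = download_and_extract_data("/tmp/corpus_tmp", dataset)
tf.logging.info("Documents: %s, labels: %s", docs_path, labels_path)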
Example #58
0
  def __init__(self, tmp_dir, data_dir, char=False):
    assert not char, "char mode for PTB is not yet implemented"
    self.char = char
    self.data_dir = data_dir

    url = PTB_URL
    filename = os.path.basename(url)
    compressed_filepath = generator_utils.maybe_download(
        tmp_dir, filename, url)
    ptb_files = []
    ptb_char_files = []
    with tarfile.open(compressed_filepath, "r:gz") as tgz:
      files = []
      # Selecting only relevant files.
      for m in tgz.getmembers():
        if "ptb" in m.name and ".txt" in m.name:
          if "char" in m.name:
            ptb_char_files += [m.name]
          else:
            ptb_files += [m.name]
          files += [m]

      tgz.extractall(tmp_dir, members=files)

    if self.char:
      files = ptb_char_files
    else:
      files = ptb_files

    for filename in files:
      if "train" in filename:
        self.train = os.path.join(tmp_dir, filename)
      elif "valid" in filename:
        self.valid = os.path.join(tmp_dir, filename)

    assert hasattr(self, "train"), "Training file not found"
    assert hasattr(self, "valid"), "Validation file not found"
    self.encoder = _get_token_encoder(data_dir, self.train)
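A minimal instantiation sketch; the class name PTB is a guess for the enclosing class (only its __init__ is shown), and the directories are assumptions.

# Hypothetical construction of the enclosing dataset class.
ptb = PTB(tmp_dir="/tmp/ptb_tmp", data_dir="/tmp/ptb_data", char=False)
tf.logging.info("train=%s valid=%s", ptb.train, ptb.valid)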
Example #59
0
def write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training):
  """Write text to files."""

  def write_to_file(all_files, urls_path, tmp_dir, filename):
    with io.open(os.path.join(tmp_dir, filename + ".source"), "w") as fstory:
      with io.open(os.path.join(tmp_dir, filename + ".target"),
                   "w") as fsummary:
        for example in example_generator(all_files, urls_path, sum_token=True):
          story, summary = _story_summary_split(example)
          fstory.write(story + "\n")
          fsummary.write(summary + "\n")

  filename = "cnndm.train" if is_training else "cnndm.dev"
  tf.logging.info("Writing %s" % filename)
  write_to_file(all_files, urls_path, tmp_dir, filename)

  if not is_training:
    test_urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt",
                                                    _TEST_URLS)
    filename = "cnndm.test"
    tf.logging.info("Writing %s" % filename)
    write_to_file(all_files, test_urls_path, tmp_dir, filename)
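A brief usage sketch, assuming the CNN/DailyMail story files and the training URL list have already been downloaded into tmp_dir by the surrounding problem code; the glob pattern and paths are illustrative assumptions.

# Hypothetical invocation of write_raw_text_to_files for the training split.
tmp_dir = "/tmp/cnndm_tmp"
all_files = tf.gfile.Glob(os.path.join(tmp_dir, "cnn_stories/*.story"))
urls_path = os.path.join(tmp_dir, "all_train.txt")
write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training=True)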
Example #60
0
def video_file_to_cbt(remote_file_path,
                      selection,
                      tmp_dir,
                      shard_id,
                      num_shards,
                      video_id,
                      downsample_xy_dims=64,
                      greyscale=True,
                      resample_every=2,
                      audio_block_size=1000):
  """Extract from input path to target CBT selection."""

  tf.logging.info("Loading CBT table {}".format(selection.table_name))

  tf.logging.info("Processing file: {}".format(remote_file_path))

  filename = "-".join(remote_file_path.split("/")[-3:])

  local_file_path = generator_utils.maybe_download(tmp_dir, filename,
                                                   remote_file_path)

  audio_array = mp4_to_1d_array(local_file_path)

  # Re-sample every N steps (numpy slicing syntax)
  audio_array = audio_array[0::resample_every]

  audio_array = np.clip((audio_array + 0.5) * 255.0, a_min=0, a_max=255)

  # Read a frame iterable
  video = video_utils.Video()
  video.load_from_file(local_file_path,
                       downsample_size=(downsample_xy_dims, downsample_xy_dims),
                       greyscale=greyscale)

  selection.write_av(audio=audio_array,
                     frames=video,
                     shard_id=shard_id,
                     video_id=video_id,
                     audio_block_size=audio_block_size)