def maybe_prepare_text(self, tmp_dir): """Download corpus if necessary, decompress, split into multiple text files. Args: tmp_dir: directory containing dataset. Returns: list of filepaths for local text files. """ compressed_filename = os.path.basename(self.corpus_url) compressed_filepath = os.path.join(tmp_dir, compressed_filename) decompressed_filepath = compressed_filepath[:-4] split_file_prefix = decompressed_filepath + "-part-" split_filepattern = split_file_prefix + "?????" split_files = sorted(tf.gfile.Glob(split_filepattern)) if not split_files: if not tf.gfile.Exists(decompressed_filepath): if not tf.gfile.Exists(compressed_filepath): generator_utils.maybe_download( tmp_dir, compressed_filepath, self.corpus_url) assert not subprocess.call(["bunzip2", compressed_filepath]) assert tf.gfile.Exists(decompressed_filepath) assert not subprocess.call([ "split", "--line-bytes=4M", "--suffix-length=5", "--numeric-suffixes", decompressed_filepath, split_file_prefix]) split_files = sorted(tf.gfile.Glob(split_filepattern)) assert split_files return split_files
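# Hedged usage sketch, not part of the original module: the shard paths returned
# by maybe_prepare_text() can be streamed one file at a time instead of loading
# the whole corpus at once. `part_filepaths` is whatever that method returned.
def iterate_corpus_parts(part_filepaths):
  """Yield corpus lines from the "-part-" files, one shard at a time."""
  for part_path in part_filepaths:
    with tf.gfile.Open(part_path) as part_file:
      for line in part_file:
        yield line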
def _get_mnist(directory): """Download all MNIST files to directory unless they are there.""" for filename in [ _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME, _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME ]: generator_utils.maybe_download(directory, filename, _MNIST_URL + filename)
def _get_fashion_mnist(directory): """Download all FashionMNIST files to directory unless they are there.""" # Fashion mnist files have the same names as MNIST. # We must choose a separate name (by adding 'fashion-' prefix) in the tmp_dir. for filename in [ _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME, _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME ]: generator_utils.maybe_download(directory, _FASHION_MNIST_LOCAL_FILE_PREFIX + filename, _FASHION_MNIST_URL + filename)
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_lines, lang2_lines = [], []
  for dataset in datasets:
    url = dataset[0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    lang1_filename, lang2_filename = dataset[1]
    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
    lang2_filepath = os.path.join(tmp_dir, lang2_filename)

    if not os.path.exists(compressed_filepath):
      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
      mode = "r:gz" if "gz" in compressed_filepath else "r"
      with tarfile.open(compressed_filepath, mode) as corpus_tar:
        corpus_tar.extractall(tmp_dir)
    # str.strip removes a *set of characters*, not a suffix, so drop the ".gz"
    # extension explicitly.
    if lang1_filepath.endswith(".gz"):
      new_filepath = lang1_filepath[:-len(".gz")]
      generator_utils.gunzip_file(lang1_filepath, new_filepath)
      lang1_filepath = new_filepath
    if lang2_filepath.endswith(".gz"):
      new_filepath = lang2_filepath[:-len(".gz")]
      generator_utils.gunzip_file(lang2_filepath, new_filepath)
      lang2_filepath = new_filepath

    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
        lang1_file_lines = lang1_file.readlines()
        lang2_file_lines = lang2_file.readlines()
        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
        lang1_lines.extend(lang1_file_lines)
        lang2_lines.extend(lang2_file_lines)

  write_chunk_size = 10000
  assert len(lang1_lines) == len(lang2_lines)
  # Write the concatenated corpora out in fixed-size chunks of lines.
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
    for i in range(0, len(lang1_lines), write_chunk_size):
      lang1_file.write("".join(lang1_lines[i:i + write_chunk_size]))
  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
    for i in range(0, len(lang2_lines), write_chunk_size):
      lang2_file.write("".join(lang2_lines[i:i + write_chunk_size]))
  return filename
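# Hedged illustration (an assumption, not from the original source): each entry
# of `datasets` passed to _compile_data above is expected to look like
# (url, (lang1_filename, lang2_filename)), for example:
#
# _SAMPLE_DATASETS = [
#     ("http://example.com/training-parallel.tgz",
#      ("train.lang1.txt", "train.lang2.txt")),
# ]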
def _maybe_download_corpus(tmp_dir): """Download and unpack the corpus. Args: tmp_dir: directory containing dataset. """ corpus_url = ("http://www.statmt.org/lm-benchmark/" "1-billion-word-language-modeling-benchmark-r13output.tar.gz") corpus_filename = os.path.basename(corpus_url) corpus_filepath = os.path.join(tmp_dir, corpus_filename) if not os.path.exists(corpus_filepath): generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url) with tarfile.open(corpus_filepath, "r:gz") as corpus_tar: corpus_tar.extractall(tmp_dir)
def load_examples(tmp_dir, prop_train=0.09, prop_val=0.01):
  """Loads examples from the tsv file.

  Args:
    tmp_dir: temp directory.
    prop_train: proportion of the train data.
    prop_val: proportion of the validation data.

  Returns:
    All examples in the dataset plus train, test, and development splits.
  """
  infile = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tf.logging.info('Loading examples')

  all_examples = []
  for i, d in enumerate(csv.DictReader(gzip.open(infile), delimiter='\t')):
    if i % 100000 == 0:
      tf.logging.info('%d examples have been loaded....' % i)
    ex = {x: int(y) if y.isdigit() else y for x, y in d.items()}
    all_examples.append(ex)

  random.seed(1)
  random.shuffle(all_examples)
  n_train = int(len(all_examples) * prop_train)
  n_val = n_train + int(len(all_examples) * prop_val)
  train = all_examples[:n_train]
  val = all_examples[n_train:n_val]
  test = []
  for e in all_examples[n_val:]:
    if e['n_intervening'] == e['n_diff_intervening']:
      test.append(e)

  return all_examples, train, val, test
def _prepare_lambada_data(tmp_dir, data_dir, vocab_size, vocab_filename):
  """Downloads and prepares the dataset.

  Args:
    tmp_dir: temp directory.
    data_dir: data directory.
    vocab_size: size of vocabulary.
    vocab_filename: name of vocab file.
  """
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MakeDirs(data_dir)

  file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tar_all = tarfile.open(file_path)
  tar_all.extractall(tmp_dir)
  tar_all.close()
  tar_train = tarfile.open(os.path.join(tmp_dir, "train-novels.tar"))
  tar_train.extractall(tmp_dir)
  tar_train.close()

  vocab_path = os.path.join(data_dir, vocab_filename)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(os.path.join(tmp_dir, _VOCAB), "r") as infile:
      reader = csv.reader(infile, delimiter="\t")
      words = [row[0] for row in reader]
      words = [_UNK] + words[:vocab_size]
    with tf.gfile.GFile(vocab_path, "w") as outfile:
      outfile.write("\n".join(words))
def generator(self, data_dir, tmp_dir, datasets, eos_list=None, start_from=0, how_many=0): i = 0 for url, subdir in datasets: filename = os.path.basename(url) compressed_file = generator_utils.maybe_download(tmp_dir, filename, url) read_type = "r:gz" if filename.endswith("tgz") else "r" with tarfile.open(compressed_file, read_type) as corpus_tar: # Create a subset of files that don't already exist. # tarfile.extractall errors when encountering an existing file # and tarfile.extract is extremely slow members = [] for f in corpus_tar: if not os.path.isfile(os.path.join(tmp_dir, f.name)): members.append(f) corpus_tar.extractall(tmp_dir, members=members) data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir) data_files = _collect_data(data_dir, "flac", "txt") data_pairs = data_files.values() encoders = self.feature_encoders(None) audio_encoder = encoders["waveforms"] text_encoder = encoders["targets"] for media_file, text_data in sorted(data_pairs)[start_from:]: if how_many > 0 and i == how_many: return i += 1 yield { "waveforms": audio_encoder.encode(media_file), "targets": text_encoder.encode(text_data) }
def generate_samples(self, data_dir, tmp_dir, dataset_split): """A generator to return data samples.Returns the data generator to return. Args: data_dir: A string representing the data directory. tmp_dir: A string representing the temporary directory and is used to download files if not already available. dataset_split: Train, Test or Eval. Yields: Each element yielded is of a Python dict of the form {"inputs": "STRING", "targets": "STRING"} """ # TODO(sanyamkapoor): Manually separate train/eval data set. csv_file_names = self.pair_files_list csv_files = [ generator_utils.maybe_download(tmp_dir, file_list[0], uri) for uri, file_list in csv_file_names ] for pairs_file in csv_files: tf.logging.debug("Reading {}".format(pairs_file)) with open(pairs_file, "r") as csv_file: for line in csv_file: reader = csv.reader(StringIO(line)) for docstring_tokens, function_tokens in reader: yield {"inputs": docstring_tokens, "targets": function_tokens}
def _maybe_download_corpus(tmp_dir, vocab_type): """Download and unpack the corpus. Args: tmp_dir: directory containing dataset. vocab_type: which vocabulary are we using. Returns: The list of names of files. """ filename = os.path.basename(PTB_URL) compressed_filepath = generator_utils.maybe_download( tmp_dir, filename, PTB_URL) ptb_files = [] ptb_char_files = [] with tarfile.open(compressed_filepath, "r:gz") as tgz: files = [] # Selecting only relevant files. for m in tgz.getmembers(): if "ptb" in m.name and ".txt" in m.name: if "char" in m.name: ptb_char_files += [m.name] else: ptb_files += [m.name] files += [m] tgz.extractall(tmp_dir, members=files) if vocab_type == text_problems.VocabType.CHARACTER: return ptb_char_files else: return ptb_files
def _maybe_download_corpus(tmp_dir): """Download corpus if necessary. Args: tmp_dir: directory containing dataset. Returns: filepath of the downloaded corpus file. """ corpus_url = ("https://dumps.wikimedia.org/enwiki/20170620/" "enwiki-20170620-pages-articles-multistream.xml.bz2") corpus_filename = os.path.basename(corpus_url) corpus_filepath = os.path.join(tmp_dir, corpus_filename) if not tf.gfile.Exists(corpus_filepath): generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url) return corpus_filepath
def _get_mscoco(directory):
  """Download and extract MSCOCO datasets to directory unless it is there."""
  for url in _MSCOCO_URLS:
    filename = os.path.basename(url)
    download_url = os.path.join(_MSCOCO_ROOT_URL, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    # Drop the ".zip" suffix explicitly; str.strip would remove a character
    # set from both ends rather than the suffix.
    unzip_dir = os.path.join(directory, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)
def _get_vqa_v2_image_raw_dataset(directory, image_root_url, image_urls):
  """Extract the VQA V2 image data set to directory unless it's there."""
  for url in image_urls:
    filename = os.path.basename(url)
    download_url = os.path.join(image_root_url, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    # Drop the ".zip" suffix to get the expected extraction directory.
    unzip_dir = os.path.join(directory, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)
def _download_and_parse_dataset(tmp_dir, train):
  """Downloads and prepares the dataset to be parsed by the data_generator."""
  file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL)
  zip_ref = zipfile.ZipFile(file_path, 'r')
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  file_name = 'train' if train else 'dev'
  dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name)
  _parse_dataset(dataset_file_path, tmp_dir, train)
def _maybe_download_corpora(self, tmp_dir): qnli_filename = "QNLI.zip" qnli_finalpath = os.path.join(tmp_dir, "QNLI") if not tf.gfile.Exists(qnli_finalpath): zip_filepath = generator_utils.maybe_download( tmp_dir, qnli_filename, self._QNLI_URL) zip_ref = zipfile.ZipFile(zip_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() return qnli_finalpath
def _maybe_download_corpora(self, tmp_dir): scitail_filename = "SciTailV1.1.zip" scitail_finalpath = os.path.join(tmp_dir, "SciTailV1.1") if not tf.gfile.Exists(scitail_finalpath): zip_filepath = generator_utils.maybe_download( tmp_dir, scitail_filename, self._SCITAIL_URL) zip_ref = zipfile.ZipFile(zip_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() return scitail_finalpath
def _maybe_download_corpora(self, tmp_dir): cola_filename = "CoLA.zip" cola_finalpath = os.path.join(tmp_dir, "CoLA") if not tf.gfile.Exists(cola_finalpath): zip_filepath = generator_utils.maybe_download( tmp_dir, cola_filename, self._COLA_URL) zip_ref = zipfile.ZipFile(zip_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() return cola_finalpath
def _maybe_download_corpora(self, tmp_dir): sst_binary_filename = "SST-2.zip" sst_binary_finalpath = os.path.join(tmp_dir, "SST-2") if not tf.gfile.Exists(sst_binary_finalpath): zip_filepath = generator_utils.maybe_download( tmp_dir, sst_binary_filename, self._SST2_URL) zip_ref = zipfile.ZipFile(zip_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() return sst_binary_finalpath
def generate_samples(self, data_dir, tmp_dir, dataset_split): dataset = self.dataset_url(dataset_split) url = dataset[0][0] compressed_filename = os.path.basename(url) compressed_filepath = os.path.join(tmp_dir, compressed_filename) generator_utils.maybe_download(tmp_dir, compressed_filename, url) mode = "r:gz" if compressed_filepath.endswith("gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: corpus_tar.extractall(tmp_dir) if self.vocab_type == text_problems.VocabType.SUBWORD: generator_utils.get_or_generate_vocab( data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size, self.vocab_data_files()) source_file, target_file = self.source_target_paths(dataset_split, tmp_dir) return text_problems.text2text_txt_iterator(source_file, target_file)
def _maybe_download_corpora(tmp_dir, dataset_split): """Download corpora if necessary and unzip them. Args: tmp_dir: directory containing dataset. dataset_split: whether we're in train/dev/test mode. Returns: List of all files generated and path to file containing train/dev/test split info. """ cnn_filename = "cnn_stories.tgz" cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/") dailymail_filename = "dailymail_stories.tgz" dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/") if not tf.gfile.Exists(cnn_finalpath): cnn_file = generator_utils.maybe_download_from_drive( tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL) with tarfile.open(cnn_file, "r:gz") as cnn_tar: cnn_tar.extractall(tmp_dir) if not tf.gfile.Exists(dailymail_finalpath): dailymail_file = generator_utils.maybe_download_from_drive( tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL) with tarfile.open(dailymail_file, "r:gz") as dailymail_tar: dailymail_tar.extractall(tmp_dir) cnn_files = tf.gfile.Glob(cnn_finalpath + "*") dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*") all_files = cnn_files + dailymail_files if dataset_split == problem.DatasetSplit.TRAIN: urls_path = generator_utils.maybe_download(tmp_dir, "all_train.txt", _TRAIN_URLS) elif dataset_split == problem.DatasetSplit.EVAL: urls_path = generator_utils.maybe_download(tmp_dir, "all_val.txt", _DEV_URLS) else: urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt", _TEST_URLS) return all_files, urls_path
def _original_vocab(tmp_dir): """Returns a set containing the original vocabulary. This is important for comparing with published results. Args: tmp_dir: directory containing dataset. Returns: a set of strings """ vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/" "vocab-2016-09-10.txt") vocab_filename = os.path.basename(vocab_url + ".en") vocab_filepath = os.path.join(tmp_dir, vocab_filename) if not os.path.exists(vocab_filepath): generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url) return set([ text_encoder.native_to_unicode(l.strip()) for l in tf.gfile.Open(vocab_filepath) ])
def testMaybeDownload(self):
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Download the Google front page to "<tmp_file>.http".
  res_path = generator_utils.maybe_download(tmp_dir, tmp_file_name + ".http",
                                            "http://google.com")
  self.assertEqual(res_path, tmp_file_path + ".http")

  # Clean up.
  os.remove(tmp_file_path + ".http")
  os.remove(tmp_file_path)
def maybe_download_dataset(self, tmp_dir, dataset_split): """Downloads the appropriate dataset file and returns its path.""" # Get the dataset url for the split requested. url = self.DATA_URLS.get(dataset_split, None) # Sanity check. if url is None: tf.logging.fatal("Unknown dataset_split passed: {}".format(dataset_split)) # Download the data, if it doesn't already exist. return generator_utils.maybe_download(tmp_dir, self._extract_filename_from_url(url), url)
def generate_data(self, data_dir, tmp_dir, task_id=-1): try: # Download source data if download_url specified h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, self.download_url) except NotImplementedError: # Otherwise, look for it locally h5_filepath = os.path.join(tmp_dir, self.h5_file) with h5py.File(h5_filepath, "r") as h5_file: num_train_examples = h5_file["train_in"].len() num_dev_examples = h5_file["valid_in"].len() num_test_examples = h5_file["test_in"].len() # Collect all_filepaths to later shuffle all_filepaths = [] # Collect created shard processes to start and join processes = [] datasets = [(self.training_filepaths, self.num_shards, "train", num_train_examples), (self.dev_filepaths, 10, "valid", num_dev_examples), (self.test_filepaths, 10, "test", num_test_examples)] for fname_fn, nshards, key_prefix, num_examples in datasets: outfiles = fname_fn(data_dir, nshards, shuffled=False) all_filepaths.extend(outfiles) for start_idx, end_idx, outfile in generate_shard_args( outfiles, num_examples): p = mp.Process( target=generate_dataset, args=(h5_filepath, key_prefix, [outfile], self.chunk_size, start_idx, end_idx)) processes.append(p) # 1 per training shard + 10 for dev + 10 for test assert len(processes) == self.num_shards + 20 # Start and wait for processes in batches num_batches = int( math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES)) for i in range(num_batches): start = i * MAX_CONCURRENT_PROCESSES end = start + MAX_CONCURRENT_PROCESSES current = processes[start:end] for p in current: p.start() for p in current: p.join() # Shuffle generator_utils.shuffle_dataset(all_filepaths)
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Downloads and extracts the dataset and generates examples. Args: data_dir: The base directory where data and vocab files are stored. tmp_dir: temp directory to download and extract the dataset. dataset_split: split of the data-set. Yields: The data examples. """ # Create directories if needed. if not tf.gfile.Exists(tmp_dir): tf.gfile.MakeDirs(tmp_dir) if not tf.gfile.Exists(data_dir): tf.gfile.MakeDirs(data_dir) # Download and extract the data. filename = os.path.basename(_URL) path = generator_utils.maybe_download(tmp_dir, filename, _URL) tarfile.open(path, "r:gz").extractall(tmp_dir) # Create the list of directories with data files. train_dirs = [ "v1.0/train-easy", "v1.0/train-medium", "v1.0/train-hard" ] eval_dirs = ["v1.0/interpolate", "v1.0/extrapolate"] dirs = eval_dirs if dataset_split == problem.DatasetSplit.TRAIN: dirs = train_dirs dirs = [os.path.join(tmp_dir, d) for d in dirs] # Iterate over directories and files generating examples. for d in dirs: files = tf.gfile.Glob(d + "/*.txt") for fname in files: # In each text file, the first line is the input, the next the answer, # and so on until the end of the file. cur_input = None with tf.gfile.Open(fname, "rb") as f: for line in f: if cur_input is None: cur_input = line.strip() else: yield { "inputs": cur_input, "targets": line.strip() } cur_input = None
def _get_vqa_v2_dataset(directory):
  """Extract the VQA V2 data set to directory unless it's there."""
  for url in _MSCOCO_IMAGE_URLS:
    filename = os.path.basename(url)
    download_url = os.path.join(_MSCOCO_ROOT_URL, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    # Drop the ".zip" suffix to get the expected extraction directory.
    unzip_dir = os.path.join(directory, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)

  annotation_file = generator_utils.maybe_download_from_drive(
      directory, "vqa_v2.tar.gz", _VQA_V2_ANNOTATION_URL)
  with tarfile.open(annotation_file, "r:gz") as annotation_tar:
    annotation_tar.extractall(directory)
def maybe_download_and_unzip(self, tmp_dir):
  """Downloads deepsea data if it doesn't already exist.

  Args:
    tmp_dir: String. The directory to maybe download to.

  Returns:
    String. The directory path where the unprocessed data was downloaded and
    unzipped.
  """
  url = ("http://deepsea.princeton.edu/media/code/"
         "deepsea_train_bundle.v0.9.tar.gz")
  filename = "deepsea_train_bundle.v0.9.tar.gz"
  generator_utils.maybe_download(tmp_dir, filename, url)
  dirpath = os.path.join(tmp_dir, "deepsea_train")
  if not os.path.exists(dirpath):
    tf.logging.info("Extracting archive %s to directory: %s", filename,
                    dirpath)
    filepath = os.path.join(tmp_dir, filename)
    tarfile.open(filepath, "r:gz").extractall(tmp_dir)
  else:
    tf.logging.info("Not extracting archive, directory already found: %s",
                    dirpath)
  return dirpath
def _maybe_download(self, tmp_dir, dataset_split):
  filename = os.path.basename(_MS_COCO_ZIPPED_FILE)
  download_url = os.path.join(_MS_COCO_DOWNLOAD_URL, filename)
  path = generator_utils.maybe_download(tmp_dir, filename, download_url)
  # Drop the ".zip" suffix to get the extraction directory.
  unzip_dir = os.path.join(tmp_dir, filename[:-len(".zip")])
  if not tf.gfile.Exists(unzip_dir):
    tf.logging.info("Unzipping data to {}".format(unzip_dir))
    zipfile.ZipFile(path, "r").extractall(unzip_dir)

  if dataset_split == problem.DatasetSplit.TRAIN:
    ms_coco_file = _MS_COCO_TRAIN_FILE
  else:
    ms_coco_file = _MS_COCO_DEV_FILE

  ms_coco_path = os.path.join(unzip_dir, "annotations", ms_coco_file)
  return ms_coco_path
def generator(self, data_dir, tmp_dir, datasets, eos_list=None, start_from=0, how_many=0): del eos_list i = 0 filename = os.path.basename(_COMMONVOICE_URL) compressed_file = generator_utils.maybe_download( tmp_dir, filename, _COMMONVOICE_URL) read_type = "r:gz" if filename.endswith(".tgz") else "r" with tarfile.open(compressed_file, read_type) as corpus_tar: # Create a subset of files that don't already exist. # tarfile.extractall errors when encountering an existing file # and tarfile.extract is extremely slow. For security, check that all # paths are relative. members = [ f for f in corpus_tar if _is_relative(tmp_dir, f.name) and not _file_exists(tmp_dir, f.name) ] corpus_tar.extractall(tmp_dir, members=members) data_dir = os.path.join(tmp_dir, "cv_corpus_v1") data_tuples = _collect_data(data_dir) encoders = self.feature_encoders(None) audio_encoder = encoders["waveforms"] text_encoder = encoders["targets"] for dataset in datasets: data_tuples = (tup for tup in data_tuples if tup[0].startswith(dataset)) for utt_id, media_file, text_data in tqdm.tqdm( sorted(data_tuples)[start_from:]): if how_many > 0 and i == how_many: return i += 1 wav_data = audio_encoder.encode(media_file) yield { "waveforms": wav_data, "waveform_lens": [len(wav_data)], "targets": text_encoder.encode(text_data), "raw_transcript": [text_data], "utt_id": [utt_id], "spk_id": ["unknown"], }
def generate_samples(self, data_dir, tmp_dir, dataset_split): url = self._URL file_name = self._DEV_SET if dataset_split == problem.DatasetSplit.TRAIN: file_name = self._TRAINING_SET squad_file = generator_utils.maybe_download( tmp_dir, file_name, os.path.join(url, file_name)) with tf.gfile.GFile(squad_file, mode="r") as fp: squad = json.load(fp) version = squad["version"] for article in squad["data"]: if "title" in article: title = article["title"].strip() else: title = "no title" for paragraph in article["paragraphs"]: context = paragraph["context"].strip() for qa in paragraph["qas"]: question = qa["question"].strip() id_ = qa["id"] answer_starts = [ answer["answer_start"] for answer in qa["answers"] ] answers = [ answer["text"].strip() for answer in qa["answers"] ] # Features currently used are "context", "question", and "answers". # Others are extracted here for the ease of future expansions. example = { "version": version, "title": title, "context": context, "question": question, "id": id_, "answer_starts": answer_starts, "answers": answers, "num_answers": len(answers), "is_supervised": True, } yield { "inputs": example["question"], # TODO(ddohan, wgaj): Figure out a way of extracting all answers. "targets": example["answers"][0], "context": example["context"] }
def _generate_examples(tmp_dir, dataset_split): """Generate squad examples. Args: tmp_dir: a string dataset_split: problem.DatasetSplit.TRAIN or problem.DatasetSplit.EVAL Yields: dictionaries representing examples """ if dataset_split == problem.DatasetSplit.TRAIN: file_name = _TRAINING_SET else: file_name = _DEV_SET squad_file = generator_utils.maybe_download(tmp_dir, file_name, os.path.join(_URL, file_name)) with tf.gfile.GFile(squad_file, mode="r") as fp: squad = json.load(fp) version = squad["version"] for article in squad["data"]: if "title" in article: title = article["title"].strip() else: title = "no title" for paragraph in article["paragraphs"]: context = paragraph["context"].strip() for qa in paragraph["qas"]: question = qa["question"].strip() id_ = qa["id"] answer_starts = [ answer["answer_start"] for answer in qa["answers"] ] answers = [answer["text"].strip() for answer in qa["answers"]] # Features currently used are "context", "question", and "answers". # Others are extracted here for the ease of future expansions. example = { "version": version, "title": title, "context": context, "question": question, "id": id_, "answer_starts": answer_starts, "answers": answers, "num_answers": len(answers), "is_supervised": True, } yield example
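# Hedged illustration, not taken from the dataset itself: the SQuAD JSON read by
# the generators above is nested roughly as sketched below; only the fields they
# actually access are shown, and all values are placeholders.
_SQUAD_JSON_SKETCH = {
    "version": "1.1",
    "data": [{
        "title": "Article title",
        "paragraphs": [{
            "context": "Paragraph text ...",
            "qas": [{
                "id": "question-id",
                "question": "Question text?",
                "answers": [{"answer_start": 0, "text": "Answer span"}],
            }],
        }],
    }],
}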
def generator(self, data_dir, tmp_dir, datasets, eos_list=None, start_from=0, how_many=0): del eos_list i = 0 for url, subdir in datasets: filename = os.path.basename(url) compressed_file = generator_utils.maybe_download( tmp_dir, filename, url) read_type = "r:gz" if filename.endswith("tgz") else "r" with tarfile.open(compressed_file, read_type) as corpus_tar: # Create a subset of files that don't already exist. # tarfile.extractall errors when encountering an existing file # and tarfile.extract is extremely slow members = [] for f in corpus_tar: if not os.path.isfile(os.path.join(tmp_dir, f.name)): members.append(f) corpus_tar.extractall(tmp_dir, members=members) raw_data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir) data_files = _collect_data(raw_data_dir, "flac", "txt") data_pairs = data_files.values() encoders = self.feature_encoders(data_dir) audio_encoder = encoders["waveforms"] text_encoder = encoders["targets"] for utt_id, media_file, text_data in sorted( data_pairs)[start_from:]: if how_many > 0 and i == how_many: return i += 1 wav_data = audio_encoder.encode(media_file) spk_id, unused_book_id, _ = utt_id.split("-") yield { "waveforms": wav_data, "waveform_lens": [len(wav_data)], "targets": text_encoder.encode(text_data), "raw_transcript": [text_data], "utt_id": [utt_id], "spk_id": [spk_id], }
def generate_samples(self, data_dir, tmp_dir, dataset_split): if dataset_split == problem.DatasetSplit.TRAIN: urls = self.get_urls(DATA_TRAIN[0], DATA_TRAIN[1]) else: urls = self.get_urls(DATA_TEST_SEEN[0], DATA_TEST_SEEN[1]) urls += self.get_urls(DATA_TEST_NOVEL[0], DATA_TEST_NOVEL[1]) for url in urls: path = generator_utils.maybe_download(tmp_dir, os.path.basename(url), url) for frame_number, frame, state, action in self.parse_frames(path): yield { "frame_number": [frame_number], "frame": frame, "state": state, "action": action, }
def generator(self, data_dir, tmp_dir, training, eos_list=None, start_from=0, how_many=0): eos_list = [1] if eos_list is None else eos_list datasets = (_LIBRISPEECH_TRAIN_DATASETS if training else _LIBRISPEECH_TEST_DATASETS) num_reserved_ids = self.feature_encoders( None)["targets"].num_reserved_ids i = 0 for url, subdir in datasets: filename = os.path.basename(url) compressed_file = generator_utils.maybe_download( tmp_dir, filename, url) read_type = "r:gz" if filename.endswith("tgz") else "r" with tarfile.open(compressed_file, read_type) as corpus_tar: # Create a subset of files that don't already exist. # tarfile.extractall errors when encountering an existing file # and tarfile.extract is extremely slow members = [] for f in corpus_tar: if not os.path.isfile(os.path.join(tmp_dir, f.name)): members.append(f) corpus_tar.extractall(tmp_dir, members=members) data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir) data_files = _collect_data(data_dir, "flac", "txt") data_pairs = data_files.values() for media_file, text_data in sorted(data_pairs)[start_from:]: if how_many > 0 and i == how_many: return i += 1 audio_data, sample_count, sample_width, num_channels = _get_audio_data( media_file) label = [num_reserved_ids + ord(c) for c in text_data] + eos_list yield { "inputs": audio_data, "audio/channel_count": [num_channels], "audio/sample_count": [sample_count], "audio/sample_width": [sample_width], "targets": label }
def generator(self, data_dir, tmp_dir, datasets, eos_list=None, start_from=0, how_many=0): del eos_list i = 0 filename = os.path.basename(_COMMONVOICE_URL) compressed_file = generator_utils.maybe_download(tmp_dir, filename, _COMMONVOICE_URL) read_type = "r:gz" if filename.endswith(".tgz") else "r" with tarfile.open(compressed_file, read_type) as corpus_tar: # Create a subset of files that don't already exist. # tarfile.extractall errors when encountering an existing file # and tarfile.extract is extremely slow. For security, check that all # paths are relative. members = [ f for f in corpus_tar if _is_relative(tmp_dir, f.name) and not _file_exists(tmp_dir, f.name) ] corpus_tar.extractall(tmp_dir, members=members) raw_data_dir = os.path.join(tmp_dir, "cv_corpus_v1") data_tuples = _collect_data(raw_data_dir) encoders = self.feature_encoders(data_dir) audio_encoder = encoders["waveforms"] text_encoder = encoders["targets"] for dataset in datasets: data_tuples = (tup for tup in data_tuples if tup[0].startswith(dataset)) for utt_id, media_file, text_data in tqdm.tqdm( sorted(data_tuples)[start_from:]): if how_many > 0 and i == how_many: return i += 1 wav_data = audio_encoder.encode(media_file) yield { "waveforms": wav_data, "waveform_lens": [len(wav_data)], "targets": text_encoder.encode(text_data), "raw_transcript": [text_data], "utt_id": [utt_id], "spk_id": ["unknown"], }
def generate_samples(self, data_dir, tmp_dir, dataset_split): url = self._URL file_name = (self._TRAINING_SET if dataset_split == problem.DatasetSplit.TRAIN else self._DEV_SET) squad_file = generator_utils.maybe_download( tmp_dir, file_name, os.path.join(url, file_name)) with tf.gfile.GFile(squad_file, mode='r') as fp: squad = json.load(fp) version = squad['version'] for article in squad['data']: if 'title' in article: title = article['title'].strip() else: title = 'no title' for paragraph in article['paragraphs']: context = paragraph['context'].strip() for qa in paragraph['qas']: question = qa['question'].strip() id_ = qa['id'] answer_starts = [ answer['answer_start'] for answer in qa['answers'] ] answers = [ answer['text'].strip() for answer in qa['answers'] ] # Features currently used are 'context', 'question', and 'answers'. # Others are extracted here for the ease of future expansions. example = { 'version': version, 'title': title, 'context': context, 'question': question, 'id': id_, 'answer_starts': answer_starts, 'answers': answers, 'num_answers': len(answers), 'is_supervised': True, } yield { 'inputs': example['question'], # TODO(ddohan, wgaj): Figure out a way of extracting all answers. 'targets': example['answers'][0], 'context': example['context'] }
def generate_samples(self, data_dir, tmp_dir, dataset_split): # pylint: disable=no-self-use,unused-argument """Returns a generator to return {"inputs": [text], "targets": [text]}.""" pair_csv_files = [ generator_utils.maybe_download(data_dir, filename, uri) for uri, filename in self.source_data_files(dataset_split) ] for pairs_file in pair_csv_files: with open(pairs_file, 'r') as csv_file: pairs_reader = csv.reader(csv_file) for row in pairs_reader: function_tokens, docstring_tokens = row[-2:] yield { 'inputs': docstring_tokens, 'targets': function_tokens }
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Generate examples.""" # Download and extract compressed_filename = os.path.basename(self.URL) download_path = generator_utils.maybe_download(tmp_dir, compressed_filename, self.URL) imdb_dir = os.path.join(tmp_dir, "aclImdb") if not tf.gfile.Exists(imdb_dir): with tarfile.open(download_path, "r:gz") as tar: tar.extractall(tmp_dir) # Generate examples train = dataset_split == problem.DatasetSplit.TRAIN dataset = "train" if train else "test" # punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' punctuation = '!#$%&\()*+-/:;.<=>?@[\\]^_`{|}' for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True): sentences = sent_tokenize(doc) # if len(sentences) > 3: # sentence = sentences[:3] # # else: # sentence = sentences # # sentence = " ".join(sentence) sentence = sentences[:] sentence = ' '.join(sentence) tokens = word_tokenize(sentence) table = str.maketrans('', '', punctuation) stripped = [w.translate(table) for w in tokens] tokens = [word.lower() for word in stripped if word != 'br'] # 512 is the max len for the bert models if len(tokens) < self.MAX_LEN_SENT: # tokens = [word for word in stripped] doc = ' '.join(tokens) doc = re.sub('\'\'', '', doc).strip() doc = re.sub('\s+', ' ', doc).strip() yield { "inputs": doc, "label": int(label), }
def _maybe_download_corpora(tmp_dir): """Download corpora for multinli. Args: tmp_dir: a string Returns: a string """ mnli_filename = "MNLI.zip" mnli_finalpath = os.path.join(tmp_dir, "MNLI") if not tf.gfile.Exists(mnli_finalpath): zip_filepath = generator_utils.maybe_download( tmp_dir, mnli_filename, _MNLI_URL) zip_ref = zipfile.ZipFile(zip_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() return mnli_finalpath
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  compressed_filename = os.path.basename(self.URL)
  download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
                                                 self.URL)
  # Use a name that does not shadow the builtin `dir`.
  dataset_dir = os.path.join(tmp_dir, 'mtl-dataset')
  if not tf.gfile.Exists(dataset_dir):
    with tarfile.open(download_path, "r:gz") as tar:
      tar.extractall(tmp_dir)

  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset = "train" if train else "test"
  for doc, label in self.doc_generator(dataset_dir, dataset,
                                       include_label=True):
    yield {
        "inputs": doc,
        "label": int(label),
    }
def generate_data(self, data_dir, tmp_dir, task_id=-1): list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" "street/python/fsns_urls.txt") fsns_urls = generator_utils.maybe_download(tmp_dir, "fsns_urls.txt", list_url) fsns_files = [ f.strip() for f in open(fsns_urls, "r") if f.startswith("http://") ] for url in fsns_files: if "/train/train" in url: generator_utils.maybe_download( data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url) elif "/validation/validation" in url: generator_utils.maybe_download( data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) elif "charset" in url: generator_utils.maybe_download(data_dir, "charset_size134.txt", url)
def _maybe_download_corpus(tmp_dir, vocab_type): """Download and unpack the corpus. Args: tmp_dir: directory containing dataset. vocab_type: which vocabulary are we using. Returns: The list of names of files. """ if vocab_type == text_problems.VocabType.CHARACTER: dataset_url = ("https://s3.amazonaws.com/research.metamind.io/wikitext" "/wikitext-103-raw-v1.zip") dir_name = "wikitext-103-raw" else: dataset_url = ("https://s3.amazonaws.com/research.metamind.io/wikitext" "/wikitext-103-v1.zip") dir_name = "wikitext-103" fname = os.path.basename(dataset_url) compressed_filepath = generator_utils.maybe_download( tmp_dir, fname, dataset_url) zip_ref = zipfile.ZipFile(compressed_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() files = os.path.join(tmp_dir, dir_name, "*") train_file, valid_file, test_file = None, None, None for f in tf.gfile.Glob(files): fname = os.path.basename(f) if "train" in fname: train_file = f elif "valid" in fname: valid_file = f elif "test" in fname: test_file = f assert train_file, "Training file not found" assert valid_file, "Validation file not found" assert test_file, "Testing file not found" return train_file, valid_file, test_file
def generate_samples(self, data_dir, tmp_dir, dataset_split): filename = os.path.basename(PTB_URL) compressed_filepath = generator_utils.maybe_download( tmp_dir, filename, PTB_URL) ptb_files = [] ptb_char_files = [] with tarfile.open(compressed_filepath, "r:gz") as tgz: files = [] # Selecting only relevant files. for m in tgz.getmembers(): if "ptb" in m.name and ".txt" in m.name: if "char" in m.name: ptb_char_files += [m.name] else: ptb_files += [m.name] files += [m] tgz.extractall(tmp_dir, members=files) if self.vocab_type == text_problems.VocabType.CHARACTER: files = ptb_char_files else: files = ptb_files train_file, valid_file = None, None for filename in files: if "train" in filename: train_file = os.path.join(tmp_dir, filename) elif "valid" in filename: valid_file = os.path.join(tmp_dir, filename) assert train_file, "Training file not found" assert valid_file, "Validation file not found" _get_token_encoder(data_dir, self.vocab_filename, train_file) train = dataset_split == problem.DatasetSplit.TRAIN filepath = train_file if train else valid_file with tf.gfile.GFile(filepath, "r") as f: for line in f: line = " ".join(line.replace("\n", " %s " % EOS).split()) yield {"targets": line}
def _maybe_download_corpus(tmp_dir): """Download and unpack the corpus. Args: tmp_dir: directory containing dataset. Returns: path to entire corpus as a text file. """ corpus_url = "http://mattmahoney.net/dc/enwik8.zip" corpus_filename = os.path.basename(corpus_url) compressed_filepath = generator_utils.maybe_download( tmp_dir, corpus_filename, corpus_url) zip_ref = zipfile.ZipFile(compressed_filepath, "r") zip_ref.extractall(tmp_dir) zip_ref.close() return os.path.join(tmp_dir, "enwik8")
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Generate examples.""" # Download and extract compressed_filename = os.path.basename(self.URL) download_path = generator_utils.maybe_download(tmp_dir, compressed_filename, self.URL) imdb_dir = os.path.join(tmp_dir, "aclImdb") if not tf.gfile.Exists(imdb_dir): with tarfile.open(download_path, "r:gz") as tar: tar.extractall(tmp_dir) # Generate examples train = dataset_split == problem.DatasetSplit.TRAIN dataset = "train" if train else "test" for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True): yield { "inputs": doc, "label": int(label), }
def generator(self, data_dir, tmp_dir, train): filename = os.path.basename(PTB_URL) compressed_filepath = generator_utils.maybe_download( tmp_dir, filename, PTB_URL) ptb_files = [] ptb_char_files = [] with tarfile.open(compressed_filepath, "r:gz") as tgz: files = [] # Selecting only relevant files. for m in tgz.getmembers(): if "ptb" in m.name and ".txt" in m.name: if "char" in m.name: ptb_char_files += [m.name] else: ptb_files += [m.name] files += [m] tgz.extractall(tmp_dir, members=files) if self.is_character_level: files = ptb_char_files else: files = ptb_files train_file, valid_file = None, None for filename in files: if "train" in filename: train_file = os.path.join(tmp_dir, filename) elif "valid" in filename: valid_file = os.path.join(tmp_dir, filename) assert train_file, "Training file not found" assert valid_file, "Validation file not found" if self.is_character_level: encoder = text_encoder.ByteTextEncoder() else: encoder = _get_token_encoder(data_dir, self.vocab_file, train_file) if train: return self._generator(train_file, encoder) return self._generator(valid_file, encoder)
def load_examples(tmp_dir, equalize_classes=False):
  """Loads examples from the tsv file.

  Args:
    tmp_dir: temp directory.
    equalize_classes: whether to equalize the number of examples in the
      classes.

  Returns:
    All examples in the dataset.
  """
  infile = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tf.logging.info('Loading examples')

  all_examples = []
  for i, d in enumerate(csv.DictReader(gzip.open(infile), delimiter='\t')):
    if i % 100000 == 0:
      tf.logging.info('%d examples have been loaded....' % i)
    ex = {x: int(y) if y.isdigit() else y for x, y in d.items()}
    all_examples.append(ex)

  classes = defaultdict(list)
  for ex in all_examples:
    classes[ex['verb_pos']].append(ex)
  del all_examples[:]

  assert len(classes) == 2
  # dict.values() is not indexable in Python 3, so materialize it first.
  c1, c2 = list(classes.values())

  random.seed(1)
  random.shuffle(c1)
  random.shuffle(c2)
  if equalize_classes:
    smallest = min(len(c1), len(c2))
    all_examples = c1[:smallest] + c2[:smallest]
  else:
    all_examples = c1 + c2
  random.shuffle(all_examples)

  return all_examples
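# Hedged sketch, an assumption rather than code from the original module: the
# class-balancing branch above can be expressed as a reusable helper that
# shuffles every class and truncates all of them to the smallest class size.
def _equalize(class_lists, seed=1):
  """Balance classes by truncating each list to the smallest class size."""
  random.seed(seed)
  for examples in class_lists:
    random.shuffle(examples)
  smallest = min(len(examples) for examples in class_lists)
  balanced = [ex for examples in class_lists for ex in examples[:smallest]]
  random.shuffle(balanced)
  return balanced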
def _prepare_babi_data(tmp_dir, data_dir): """Downloads and extracts the dataset. Args: tmp_dir: temp directory to download and extract the dataset data_dir: The base directory where data and vocab files are stored. Returns: tmp_dir: temp directory containing the raw data. """ if not tf.gfile.Exists(data_dir): tf.gfile.MakeDirs(data_dir) # TODO(dehghani@): find a solution for blocking user-agent (download) file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL) tar = tarfile.open(file_path) tar.extractall(tmp_dir) tar.close() return tmp_dir
def generate_data(self, data_dir, tmp_dir, num_shards=None): if num_shards is None: num_shards = 100 # Download source data h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file, self.download_url) with h5py.File(h5_filepath, "r") as h5_file: num_train_examples = h5_file["train_in"].len() num_dev_examples = h5_file["valid_in"].len() num_test_examples = h5_file["test_in"].len() # Collect all_filepaths to later shuffle all_filepaths = [] # Collect created shard processes to start and join processes = [] datasets = [(self.training_filepaths, num_shards, "train", num_train_examples), (self.dev_filepaths, 1, "valid", num_dev_examples), (self.test_filepaths, 1, "test", num_test_examples)] for fname_fn, nshards, key_prefix, num_examples in datasets: outfiles = fname_fn(data_dir, nshards, shuffled=False) all_filepaths.extend(outfiles) for start_idx, end_idx, outfile in generate_shard_args( outfiles, num_examples): p = mp.Process(target=generate_dataset, args=(h5_filepath, key_prefix, [outfile], start_idx, end_idx)) processes.append(p) # Start and wait for processes assert len( processes) == num_shards + 2 # 1 per training shard + dev + test for p in processes: p.start() for p in processes: p.join() # Shuffle generator_utils.shuffle_dataset(all_filepaths)
def generate_samples(self, data_dir, tmp_dir, dataset_split): path = generator_utils.maybe_download( tmp_dir, os.path.basename(DATA_URL), DATA_URL) tar = tarfile.open(path) tar.extractall(tmp_dir) tar.close() if dataset_split == problem.DatasetSplit.TRAIN: base_dir = os.path.join(tmp_dir, "softmotion30_44k/train/*") else: base_dir = os.path.join(tmp_dir, "softmotion30_44k/test/*") filenames = tf.gfile.Glob(base_dir) for frame_number, frame, state, action in self.parse_frames(filenames): yield { "frame_number": [frame_number], "frame": frame, "state": state, "action": action, }
def download_and_extract_data(tmp_dir, dataset):
  """Download and extract files."""
  url = dataset[0]
  compressed_filename = os.path.basename(url)
  compressed_file = generator_utils.maybe_download(tmp_dir,
                                                   compressed_filename, url)
  for filename in dataset[1]:
    tf.logging.info("Reading file: %s" % filename)
    filepath = os.path.join(tmp_dir, filename)

    # Extract from tar if needed.
    if not tf.gfile.Exists(filepath):
      with tarfile.open(compressed_file, "r:gz") as corpus_tar:
        corpus_tar.extractall(tmp_dir)

  documents_filename, labels_filename = dataset[1]
  documents_filepath = os.path.join(tmp_dir, documents_filename)
  labels_filepath = os.path.join(tmp_dir, labels_filename)
  return documents_filepath, labels_filepath
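# Hedged illustration (names and URL are placeholders): `dataset` passed to
# download_and_extract_data above is expected to look like
# (url, (documents_filename, labels_filename)), e.g.:
#
# docs_path, labels_path = download_and_extract_data(
#     tmp_dir, ("http://example.com/corpus.tar.gz", ("docs.txt", "labels.txt")))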
def __init__(self, tmp_dir, data_dir, char=False):
  assert not char, "char mode for PTB is not yet implemented"
  self.char = char
  self.data_dir = data_dir

  url = PTB_URL
  filename = os.path.basename(url)
  compressed_filepath = generator_utils.maybe_download(tmp_dir, filename, url)
  ptb_files = []
  ptb_char_files = []

  with tarfile.open(compressed_filepath, "r:gz") as tgz:
    files = []
    # Selecting only relevant files.
    for m in tgz.getmembers():
      if "ptb" in m.name and ".txt" in m.name:
        if "char" in m.name:
          ptb_char_files += [m.name]
        else:
          ptb_files += [m.name]
        files += [m]
    tgz.extractall(tmp_dir, members=files)

  if self.char:
    files = ptb_char_files
  else:
    files = ptb_files

  for filename in files:
    if "train" in filename:
      self.train = os.path.join(tmp_dir, filename)
    elif "valid" in filename:
      self.valid = os.path.join(tmp_dir, filename)

  assert hasattr(self, "train"), "Training file not found"
  assert hasattr(self, "valid"), "Validation file not found"
  self.encoder = _get_token_encoder(data_dir, self.train)
def write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training): """Write text to files.""" def write_to_file(all_files, urls_path, tmp_dir, filename): with io.open(os.path.join(tmp_dir, filename + ".source"), "w") as fstory: with io.open(os.path.join(tmp_dir, filename + ".target"), "w") as fsummary: for example in example_generator(all_files, urls_path, sum_token=True): story, summary = _story_summary_split(example) fstory.write(story + "\n") fsummary.write(summary + "\n") filename = "cnndm.train" if is_training else "cnndm.dev" tf.logging.info("Writing %s" % filename) write_to_file(all_files, urls_path, tmp_dir, filename) if not is_training: test_urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt", _TEST_URLS) filename = "cnndm.test" tf.logging.info("Writing %s" % filename) write_to_file(all_files, test_urls_path, tmp_dir, filename)
def video_file_to_cbt(remote_file_path, selection, tmp_dir, shard_id, num_shards, video_id, downsample_xy_dims=64, greyscale=True, resample_every=2, audio_block_size=1000): """Extract from input path to target CBT selection.""" tf.logging.info("Loading CBT table {}".format(selection.table_name)) tf.logging.info("Processing file: {}".format(remote_file_path)) filename = "-".join(remote_file_path.split("/")[-3:]) local_file_path = generator_utils.maybe_download(tmp_dir, filename, remote_file_path) audio_array = mp4_to_1d_array(local_file_path) # Re-sample every N steps (numpy slicing syntax) audio_array = audio_array[0::resample_every] audio_array = np.clip((audio_array + 0.5) * 255.0, a_min=0, a_max=255) # Read a frame iterable video = video_utils.Video() video.load_from_file(local_file_path, downsample_size=(downsample_xy_dims, downsample_xy_dims), greyscale=greyscale) selection.write_av(audio=audio_array, frames=video, shard_id=shard_id, video_id=video_id, audio_block_size=audio_block_size)