def load_data(self): """ Fetch the MNIST dataset and load it into memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. Returns: tuple: Both training and test sets are returned. """ workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): fetch_file(self.url, self.filename, filepath, self.size) with gzip.open(filepath, 'rb') as f: self.train_set, self.valid_set = pickle_load(f) self.train_set = {'image': {'data': self.train_set[0].reshape(60000, 28, 28), 'axes': ('batch', 'height', 'width')}, 'label': {'data': self.train_set[1], 'axes': ('batch',)}} self.valid_set = {'image': {'data': self.valid_set[0].reshape(10000, 28, 28), 'axes': ('batch', 'height', 'width')}, 'label': {'data': self.valid_set[1], 'axes': ('batch',)}} return self.train_set, self.valid_set
def load_data(self):
    """
    Fetch the text corpus for each phase and load it into memory.

    Returns:
        dict: Input and target token indices for each phase.
    """
    self.data_dict = {}
    self.vocab = None
    for phase in ['train', 'test', 'valid']:
        filename, filesize = self.filemap[phase]['filename'], self.filemap[phase]['size']
        workdir, filepath = valid_path_append(self.path, '', filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, filename, filepath, filesize)

        with open(filepath) as f:
            tokens = f.read()  # add tokenization here if necessary

        if self.use_words:
            tokens = tokens.strip().split()

        # build the vocabulary once, from the first phase processed
        self.vocab = sorted(set(tokens)) if self.vocab is None else self.vocab

        # vocab dicts
        self.token_to_index = dict((t, i) for i, t in enumerate(self.vocab))
        self.index_to_token = dict((i, t) for i, t in enumerate(self.vocab))

        # map tokens to indices
        X = np.asarray([self.token_to_index[t] for t in tokens], dtype=np.uint32)
        if self.shift_target:
            # next-token prediction: target is the input shifted left by one
            y = np.concatenate((X[1:], X[:1]))
        else:
            y = X.copy()

        self.data_dict[phase] = {'inp_txt': X, 'tgt_txt': y}

    return self.data_dict
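
# Illustrative sketch (added, hedged): the vocabulary build, token-to-index
# mapping, and shifted next-token target used by load_data() above, on a tiny
# hand-made corpus. The helper name is hypothetical.
def _token_index_sketch():
    import numpy as np
    tokens = 'the cat sat on the mat'.split()
    vocab = sorted(set(tokens))
    token_to_index = {t: i for i, t in enumerate(vocab)}
    X = np.asarray([token_to_index[t] for t in tokens], dtype=np.uint32)
    y = np.concatenate((X[1:], X[:1]))  # target is the input shifted left by one
    return X, y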
def load_data(self, test_split=0.2):
    """
    Fetch the review dataset, preprocess it, and split it into training
    and validation sets.

    Arguments:
        test_split (float, optional): Fraction of the data held out for
                                      validation. Defaults to 0.2.

    Returns:
        dict: Training and validation reviews and labels.
    """
    self.data_dict = {}
    self.vocab = None
    workdir, filepath = valid_path_append(self.path, '', self.filename)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.filename, filepath, self.filesize)

    with open(filepath, 'rb') as f:
        X, y = pickle_load(f)

    X = preprocess_text(X, self.vocab_size)
    X = pad_sentences(X, pad_idx=self.pad_idx,
                      pad_to_len=self.sentence_length, pad_from='left')

    if self.shuffle:
        indices = np.arange(len(y))
        np.random.shuffle(indices)
        X = X[indices]
        y = np.asarray(y)[indices]

    # split the data
    split_idx = int(len(X) * (1 - test_split))
    X_train, y_train = X[:split_idx], np.asarray(y[:split_idx])
    X_test, y_test = X[split_idx:], np.asarray(y[split_idx:])

    self.nclass = 1 + max(np.max(y_train), np.max(y_test))

    self.data_dict['train'] = {'review': {'data': X_train,
                                          'axes': ('batch', 'REC')},
                               'label': {'data': y_train,
                                         'axes': ('batch',)}}
    self.data_dict['valid'] = {'review': {'data': X_test,
                                          'axes': ('batch', 'REC')},
                               'label': {'data': y_test,
                                         'axes': ('batch',)}}
    return self.data_dict
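
# Illustrative sketch (added, hedged): the shuffle-then-split arithmetic used
# by load_data(test_split=...) above, on synthetic data. The helper name is
# hypothetical.
def _train_test_split_sketch(test_split=0.2):
    import numpy as np
    X = np.arange(20).reshape(10, 2)
    y = np.arange(10)
    indices = np.arange(len(y))
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    split_idx = int(len(X) * (1 - test_split))  # first 80% train, rest test
    return (X[:split_idx], y[:split_idx]), (X[split_idx:], y[split_idx:])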
def load_data(self):
    """
    Fetch the raw text corpus and split it into training and test segments.

    Returns:
        tuple: The training and test text segments.
    """
    self.data_dict = {}
    workdir, filepath = valid_path_append(self.path, '', self.filename)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.filename, filepath)

    with open(filepath) as f:
        tokens = f.read()

    train_samples = int(self.train_split * len(tokens))
    train = tokens[:train_samples]
    test = tokens[train_samples:]

    return train, test
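
# Illustrative sketch (added, hedged): the proportional character-level split
# performed by load_data() above, on a short literal string. The helper name
# is hypothetical.
def _char_split_sketch(train_split=0.9):
    tokens = 'abcdefghij'
    train_samples = int(train_split * len(tokens))  # 9 of 10 characters here
    return tokens[:train_samples], tokens[train_samples:]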
def load_data(self): """ Fetch the CIFAR-10 dataset and load it into memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. normalize (bool, optional): Whether to scale values between 0 and 1. Defaults to True. Returns: tuple: Both training and test sets are returned. """ workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): fetch_file(self.url, self.filename, filepath, self.size) batchdir = os.path.join(workdir, 'cifar-10-batches-py') if not os.path.exists(os.path.join(batchdir, 'data_batch_1')): assert os.path.exists(filepath), "Must have cifar-10-python.tar.gz" with tarfile.open(filepath, 'r:gz') as f: f.extractall(workdir) train_batches = [os.path.join(batchdir, 'data_batch_' + str(i)) for i in range(1, 6)] Xlist, ylist = [], [] for batch in train_batches: with open(batch, 'rb') as f: d = pickle_load(f) Xlist.append(d['data']) ylist.append(d['labels']) X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32) y_train = np.vstack(ylist).ravel() with open(os.path.join(batchdir, 'test_batch'), 'rb') as f: d = pickle_load(f) X_test, y_test = d['data'], d['labels'] X_test = X_test.reshape(-1, 3, 32, 32) self.train_set = {'image': {'data': X_train, 'axes': ('batch', 'channel', 'height', 'width')}, 'label': {'data': y_train, 'axes': ('batch',)}} self.valid_set = {'image': {'data': X_test, 'axes': ('batch', 'channel', 'height', 'width')}, 'label': {'data': np.array(y_test), 'axes': ('batch',)}} return self.train_set, self.valid_set
def load_data(self, path=".", subset='wiki_entities'): """ Fetch the Facebook WikiMovies dataset and load it to memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. Returns: tuple: knowledge base, entity list, training and test files are returned """ self.data_dict = {} self.vocab = None workdir, filepath = valid_path_append(path, '', self.filename) babi_dir_name = self.filename.split('.')[0] if subset == 'wiki-entities': subset_folder = 'wiki_entities' else: subset_folder = subset file_base = babi_dir_name + '/questions/' + subset_folder + '/' + subset + '_qa_{}.txt' train_file = os.path.join(workdir, file_base.format('train')) test_file = os.path.join(workdir, file_base.format('test')) entity_file_path = babi_dir_name + '/knowledge_source/entities.txt' entity_file = os.path.join(workdir, entity_file_path) # Check for the existence of the entity file # If it isn't there then we know we need to fetch everything if not os.path.exists(entity_file): if license_prompt('WikiMovies', 'https://research.fb.com/downloads/babi/', self.path) is False: sys.exit(0) fetch_file(self.url, self.filename, filepath, self.size) knowledge_file_path = babi_dir_name + '/knowledge_source/' + subset_folder + '/' \ + subset_folder + '_kb.txt' kb_file = os.path.join(workdir, knowledge_file_path) return entity_file, kb_file, train_file, test_file
def load_data(self, data_directory=None, manifest_file=None):
    """
    Create a manifest file for the requested dataset. First downloads the
    dataset and extracts it, if necessary.

    Arguments:
        data_directory (str): Path to data directory. Defaults to
                              <path>/<version>.
        manifest_file (str): Path to manifest file. Defaults to
                             ``self.manifest_file`` if set, else
                             <path>/manifest.tsv.

    Returns:
        str: Path to the manifest file.
    """
    if manifest_file is None:
        if self.manifest_file is not None:
            manifest_file = self.manifest_file
        else:
            manifest_file = os.path.join(self.path, "manifest.tsv")

    if os.path.exists(manifest_file):
        return manifest_file

    # Download the file
    workdir, filepath = valid_path_append(self.path, '', self.source_file)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.source_file, filepath)

    # Untar the file
    if data_directory is None:
        data_directory = os.path.join(self.path, self.version)
    if not os.path.exists(data_directory):
        print("Extracting tar file to {}".format(data_directory))
        with contextlib.closing(tarfile.open(filepath)) as tf:
            tf.extractall(data_directory)

    # Ingest the file
    ingest_librispeech(data_directory, manifest_file)

    return manifest_file
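
# Illustrative sketch (added, hedged): the guarded extract step used by
# load_data() above. contextlib.closing() keeps the tarfile usage compatible
# with Pythons where TarFile is not a context manager. The helper name is
# hypothetical.
def _extract_once_sketch(filepath, data_directory):
    import contextlib
    import os
    import tarfile
    if not os.path.exists(data_directory):
        with contextlib.closing(tarfile.open(filepath)) as tf:
            tf.extractall(data_directory)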
def load_data(self): """ Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded. Returns: tuple: training and test filenames are returned """ if self.task < 5: self.candidate_answer_filename = 'dialog-babi-candidates.txt' self.kb_filename = 'dialog-babi-kb-all.txt' self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy' self.vocab_filename = 'dialog-babi-vocab-task{}.pkl'.format(self.task + 1) else: self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt' self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt' self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy' self.vocab_filename = 'dstc2-vocab-task{}.pkl'.format(self.task + 1) self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task + 1) self.data_dict = {} self.vocab = None self.workdir, filepath = valid_path_append( self.path, '', self.filename) if not os.path.exists(filepath): if license_prompt('bAbI-dialog', 'https://research.fb.com/downloads/babi/', self.path) is False: sys.exit(0) fetch_file(self.url, self.filename, filepath, self.size) self.babi_dir_name = self.filename.split('.')[0] self.candidate_answer_filename = self.babi_dir_name + \ '/' + self.candidate_answer_filename self.kb_filename = self.babi_dir_name + '/' + self.kb_filename self.cands_mat_filename = os.path.join( self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename) self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt' train_file = os.path.join(self.workdir, task_name.format('trn')) dev_file = os.path.join(self.workdir, task_name.format('dev')) test_file_postfix = 'tst-OOV' if self.oov else 'tst' test_file = os.path.join( self.workdir, task_name.format(test_file_postfix)) cand_file = os.path.join(self.workdir, self.candidate_answer_filename) kb_file = os.path.join(self.workdir, self.kb_filename) vocab_file = os.path.join(self.workdir, self.vocab_filename) vectorized_file = os.path.join(self.workdir, self.vectorized_filename) if (os.path.exists(train_file) is False or os.path.exists(dev_file) is False or os.path.exists(test_file) is False or os.path.exists(cand_file) is False): with tarfile.open(filepath, 'r:gz') as f: f.extractall(self.workdir) return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
def load_data(self): """ Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded. Returns: tuple: training and test filenames are returned """ if self.task < 5: self.candidate_answer_filename = 'dialog-babi-candidates.txt' self.kb_filename = 'dialog-babi-kb-all.txt' self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy' self.vocab_filename = 'dialog-babi-vocab-task{}.pkl'.format( self.task + 1) else: self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt' self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt' self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy' self.vocab_filename = 'dstc2-vocab-task{}.pkl'.format(self.task + 1) self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task + 1) self.data_dict = {} self.vocab = None self.workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): if license_prompt('bAbI-dialog', 'https://research.fb.com/downloads/babi/', self.path) is False: sys.exit(0) fetch_file(self.url, self.filename, filepath, self.size) self.babi_dir_name = self.filename.split('.')[0] self.candidate_answer_filename = self.babi_dir_name + \ '/' + self.candidate_answer_filename self.kb_filename = self.babi_dir_name + '/' + self.kb_filename self.cands_mat_filename = os.path.join( self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename) self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt' train_file = os.path.join(self.workdir, task_name.format('trn')) dev_file = os.path.join(self.workdir, task_name.format('dev')) test_file_postfix = 'tst-OOV' if self.oov else 'tst' test_file = os.path.join(self.workdir, task_name.format(test_file_postfix)) cand_file = os.path.join(self.workdir, self.candidate_answer_filename) kb_file = os.path.join(self.workdir, self.kb_filename) vocab_file = os.path.join(self.workdir, self.vocab_filename) vectorized_file = os.path.join(self.workdir, self.vectorized_filename) if (os.path.exists(train_file) is False or os.path.exists(dev_file) is False or os.path.exists(test_file) is False or os.path.exists(cand_file) is False): with tarfile.open(filepath, 'r:gz') as f: f.extractall(self.workdir) return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file