def __init__(self, max_seq_len, train=False, val=False, test=False, root="data/", **kwargs):
    self.max_seq_len = max_seq_len
    self.dist_between_sents = int(self.max_seq_len / 10)
    self.is_train = train
    dataset = penn_treebank_dataset(root + "penn-treebank", train=train, dev=val, test=test)
    self.vocabulary = PennTreeBankDataset.get_vocabulary(root=root)
    self.index_to_word = {val: key for key, val in self.vocabulary.items()}
    # Split the token stream into sentences on the "</s>" marker
    words = [[]]
    for word_index, word in enumerate(dataset):
        if word == "</s>":
            words.append([])
        else:
            if word in self.vocabulary:
                words[-1].append(self.vocabulary[word])
            else:
                # Fall back to character indices for out-of-vocabulary words
                words[-1] += [self.vocabulary[c] for c in word]
        if word != "</s>":
            words[-1].append(self.vocabulary[" "])
    # Keep only non-empty sentences shorter than max_seq_len
    self.data = [np.array(sent) for sent in words
                 if (len(sent) != 0 and len(sent) < self.max_seq_len)]
    print("Length of dataset: ", len(self))
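# Minimal usage sketch for the constructor above. It assumes the surrounding
# PennTreeBankDataset class also defines __len__/__getitem__ as usual; the
# max_seq_len value of 256 is an arbitrary example, not taken from the source.
train_set = PennTreeBankDataset(max_seq_len=256, train=True, root="data/")
first_sentence = train_set.data[0]  # np.array of vocabulary indices
decoded = "".join(train_set.index_to_word[int(i)] for i in first_sentence)
print(len(train_set.data), decoded)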
def initialize_dataset(
    corpus_name,
    max_word_length=None,
    max_words=None,
    data_splits=[0.7, 0.2, 0.1],
):
    print("Initializing dataset ...")
    # Splits must sum to 1 (up to floating-point tolerance)
    assert abs(sum(data_splits) - 1) < 0.0001
    if corpus_name == "penn-treebank":
        from torchnlp.datasets import penn_treebank_dataset
        train, val, test = penn_treebank_dataset(train=True, dev=True, test=True)
        train = preprocess_sentence(train)
        val = preprocess_sentence(val)
        test = preprocess_sentence(test)
        datasets = {"train": train, "val": val, "test": test}
    elif corpus_name == "brown":
        import nltk
        nltk.download("brown")
        from nltk.corpus import brown
        processed_txt = []
        for s in brown.sents():
            processed_txt += preprocess_sentence(s)
        # Split the token stream according to the cumulative split fractions
        n_tokens = len(processed_txt)
        split_n = [int(s * n_tokens) for s in np.cumsum(data_splits)]
        train = processed_txt[:split_n[0]]
        val = processed_txt[split_n[0]:split_n[1]]
        test = processed_txt[split_n[1]:]
        datasets = {"train": train, "val": val, "test": test}
    else:
        raise ValueError(f"Corpus {corpus_name} not supported")
    # Recreate the output directory from scratch
    root_path = os.path.join("data", corpus_name, "objects")
    if os.path.exists(root_path):
        shutil.rmtree(root_path)
    os.makedirs(root_path + "/train", exist_ok=True)
    os.makedirs(root_path + "/val", exist_ok=True)
    os.makedirs(root_path + "/test", exist_ok=True)
    return create_objects(corpus_name, datasets, root_path, max_word_length, max_words)
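# Hedged example call for initialize_dataset(); the corpus name and the
# word-length cap are illustrative only, and preprocess_sentence(),
# create_objects(), np, os and shutil are assumed to come from the
# surrounding module.
objects = initialize_dataset(
    "penn-treebank",
    max_word_length=20,
    max_words=None,
    data_splits=[0.7, 0.2, 0.1],
)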
def create_vocabulary(root="data/"):
    if root is None:
        root = ""
    dataset = penn_treebank_dataset(root + "penn-treebank",
                                    train=True, dev=False, test=False)
    all_words = [w for w in dataset]
    vocabulary = list(set([c for w in all_words for c in w])) + [" ", "<unk>", "</s>"]
    vocabulary = sorted(vocabulary)
    vocabulary = {vocabulary[i]: i for i in range(len(vocabulary))}
    with open(root + PennTreeBankDataset.VOCABULARY_FILE, "w") as f:
        json.dump(vocabulary, f, indent=4)
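# Sketch of building and reloading the character vocabulary produced above.
# It assumes PennTreeBankDataset.VOCABULARY_FILE is a JSON filename relative
# to root, as implied by the json.dump call in create_vocabulary().
import json

create_vocabulary(root="data/")
with open("data/" + PennTreeBankDataset.VOCABULARY_FILE) as f:
    vocab = json.load(f)
print(len(vocab), vocab.get("<unk>"))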
def test_penn_treebank_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that rows are parsed correctly
    train, dev, test = penn_treebank_dataset(
        directory=directory, test=True, dev=True, train=True, check_files=[])
    assert len(train) > 0
    assert len(test) > 0
    assert len(dev) > 0
    assert train[0:10] == [
        'aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett',
        'fromstein', 'gitano', 'guterman', 'hydro-quebec'
    ]
def sample_data():
    return penn_treebank_dataset(dev=True)
def _build_dataloader(self):
    self.val_loader = self.corpus = None
    if self.dataset_kind == "mnist":
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        self.dataset = MNISTBufferedDataset(self.data_dir, download=True,
                                            train=True, transform=transform)
        self.val_dataset = MNISTBufferedDataset(self.data_dir, download=True,
                                                transform=transform)
        self.train_sampler = MNISTSequenceSampler(
            self.dataset,
            sequences=self.sequences,
            batch_size=self.batch_size,
            random_mnist_images=not self.static_digit,
            randomize_sequence_cursors=self.randomize_sequence_cursors,
            noise_buffer=self.noise_buffer,
            use_mnist_pct=self.use_mnist_pct,
            max_batches=self.batches_in_epoch,
        )
        if self.static_digit:
            # For the static digit paradigm, val & train samplers must match
            # to ensure the same digit prototype is used for each sequence item.
            self.val_sampler = self.train_sampler
        else:
            self.val_sampler = MNISTSequenceSampler(
                self.val_dataset,
                sequences=self.sequences,
                batch_size=self.batch_size,
                random_mnist_images=not self.static_digit,
                randomize_sequence_cursors=self.randomize_sequence_cursors,
                noise_buffer=self.noise_buffer,
                use_mnist_pct=self.use_mnist_pct,
                max_batches=self.eval_batches_in_epoch,
            )
        self.train_loader = DataLoader(
            self.dataset,
            batch_sampler=self.train_sampler,
            collate_fn=pred_sequence_collate,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_sampler=self.val_sampler,
            collate_fn=pred_sequence_collate,
        )
    elif self.dataset_kind == "ptb":
        # Download the Penn Treebank dataset if it is not already cached
        from torchnlp.datasets import penn_treebank_dataset
        print("Maybe download PTB...")
        penn_treebank_dataset(self.data_dir + "/PTB", train=True, test=True)
        corpus = lang_util.Corpus(self.data_dir + "/PTB")
        train_sampler = PTBSequenceSampler(
            corpus.train,
            batch_size=self.batch_size,
            max_batches=self.batches_in_epoch,
        )
        # Build the word-id -> vector dictionary for the chosen embedding kind
        if self.embedding_kind == "rsm_bitwise":
            embedding = lang_util.BitwiseWordEmbedding().embedding_dict
        elif self.embedding_kind in ["bpe", "glove"]:
            from torchnlp.word_to_vector import BPEmb, GloVe
            cache_dir = self.data_dir + "/torchnlp/.word_vectors_cache"
            if self.embedding_kind == "bpe":
                vectors = BPEmb(dim=self.embed_dim, cache=cache_dir)
            else:
                vectors = GloVe(name="6B", dim=self.embed_dim, cache=cache_dir)
            embedding = {}
            for word_id, word in enumerate(corpus.dictionary.idx2word):
                embedding[word_id] = vectors[word]
        elif "ptb_fasttext" in self.embedding_kind:
            import fasttext
            # Generated via notebooks/ptb_embeddings.ipynb
            embedding = {}
            ft_model = fasttext.load_model(
                self.data_dir + "/embeddings/%s.bin" % self.embedding_kind)
            for word_id, word in enumerate(corpus.dictionary.idx2word):
                embedding[word_id] = torch.tensor(ft_model[word])
        if self.embedding_kind:
            print("Loaded embedding dict (%s) with %d entries"
                  % (self.embedding_kind, len(embedding)))
        collate_fn = partial(ptb_pred_sequence_collate, vector_dict=embedding)
        self.train_loader = DataLoader(corpus.train,
                                       batch_sampler=train_sampler,
                                       collate_fn=collate_fn)
        val_sampler = PTBSequenceSampler(
            corpus.test,
            batch_size=self.eval_batch_size,
            max_batches=self.eval_batches_in_epoch,
            uniform_offsets=True,
        )
        self.val_loader = DataLoader(corpus.test,
                                     batch_sampler=val_sampler,
                                     collate_fn=collate_fn)
        self.corpus = corpus
    print("Built dataloaders...")
import torch
from torchnlp.datasets import penn_treebank_dataset

train = penn_treebank_dataset(train=True)
print(train[:100])
def _build_dataloader(self):
    # Extra element for sequential prediction labels
    self.val_loader = None
    if self.dataset_kind == "mnist":
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        self.dataset = MNISTBufferedDataset(self.data_dir, download=True,
                                            train=True, transform=transform)
        self.val_dataset = MNISTBufferedDataset(self.data_dir, download=True,
                                                transform=transform)
        self.train_sampler = MNISTSequenceSampler(
            self.dataset,
            sequences=self.sequences,
            batch_size=self.batch_size,
            random_mnist_images=not self.static_digit,
            noise_buffer=self.noise_buffer,
            use_mnist_pct=self.use_mnist_pct,
            max_batches=self.batches_in_epoch,
        )
        if self.static_digit:
            # For the static digit paradigm, val & train samplers must match
            # to ensure the same digit prototype is used for each sequence item.
            self.val_sampler = self.train_sampler
        else:
            self.val_sampler = MNISTSequenceSampler(
                self.val_dataset,
                sequences=self.sequences,
                batch_size=self.batch_size,
                random_mnist_images=not self.static_digit,
                noise_buffer=self.noise_buffer,
                use_mnist_pct=self.use_mnist_pct,
                max_batches=self.batches_in_epoch,
            )
        self.train_loader = DataLoader(
            self.dataset,
            batch_sampler=self.train_sampler,
            collate_fn=pred_sequence_collate,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_sampler=self.val_sampler,
            collate_fn=pred_sequence_collate,
        )
    elif self.dataset_kind == "ptb":
        # Download the Penn Treebank dataset if it is not already cached
        from torchnlp.datasets import penn_treebank_dataset
        penn_treebank_dataset(self.data_dir + "/PTB", train=True)
        corpus = lang_util.Corpus(self.data_dir + "/PTB")
        train_sampler = PTBSequenceSampler(
            corpus.train,
            batch_size=self.batch_size,
            max_batches=self.batches_in_epoch,
        )
        if self.embedding_kind == "rsm_bitwise":
            embedding = lang_util.BitwiseWordEmbedding().embedding_dict
        elif self.embedding_kind == "bpe":
            from torchnlp.word_to_vector import BPEmb
            cache_dir = self.data_dir + "/torchnlp/.word_vectors_cache"
            vectors = BPEmb(dim=self.embed_dim, cache=cache_dir)
            embedding = {}
            for word_id, word in enumerate(corpus.dictionary.idx2word):
                embedding[word_id] = vectors[word]
        collate_fn = partial(ptb_pred_sequence_collate, vector_dict=embedding)
        self.train_loader = DataLoader(corpus.train,
                                       batch_sampler=train_sampler,
                                       collate_fn=collate_fn)
        val_sampler = PTBSequenceSampler(
            corpus.test,
            batch_size=self.batch_size,
            max_batches=self.batches_in_epoch,
        )
        self.val_loader = DataLoader(corpus.test,
                                     batch_sampler=val_sampler,
                                     collate_fn=collate_fn)
# You should have received a copy of the GNU Affero Public License
# along with this program. If not, see http://www.gnu.org/licenses.
#
# http://numenta.org/licenses/

import os
import sys

import fasttext
from torchnlp.datasets import penn_treebank_dataset

PATH = "/home/ubuntu"
# PATH = "/Users/jgordon"

print("Maybe download ptb...")
penn_treebank_dataset(PATH + "/nta/datasets/PTB", train=True, test=True)

PTB_TRAIN_PATH = PATH + "/nta/datasets/PTB/ptb.train.txt"

# Number of training epochs can be passed as the first CLI argument
if len(sys.argv) > 1:
    epoch = int(sys.argv[1])
else:
    epoch = 5

model = fasttext.train_unsupervised(PTB_TRAIN_PATH, model="skipgram",
                                    minCount=1, epoch=epoch)

embed_dir = PATH + "/nta/datasets/embeddings"
filename = PATH + "/nta/datasets/embeddings/ptb_fasttext_e%d.bin" % epoch
if not os.path.exists(embed_dir):
    os.makedirs(embed_dir)
# Assumed continuation: persist the trained vectors to `filename` so they can
# be reloaded later with fasttext.load_model().
model.save_model(filename)
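# Hedged follow-up: the .bin produced above can be read back the same way the
# "ptb_fasttext" branch of the dataloader does. fasttext.load_model() and
# get_dimension() are the public fastText Python API; the epoch-5 filename
# simply matches the default epoch count above.
ft_model = fasttext.load_model(PATH + "/nta/datasets/embeddings/ptb_fasttext_e5.bin")
print(ft_model.get_dimension(), ft_model["bank"][:5])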