dataset.threshold_data(13, tokenizer=tokenizer)
dataset.trim_words(3, tokenizer=tokenizer)
vocab_dict = dataset.create_vocab_dict(tokenizer)

# Load embeddings from file, or set to None (to be randomly initialized)
if options.embeddings is not None:
    new_emb_file = './cache/new_embs.txt'
    old_emb_file = options.embeddings
    freq_words_file = './cache/freq_words.txt'
    emb_dim = options.emb_dim
    create_emb_file(new_emb_file, old_emb_file, freq_words_file, vocab_dict,
                    most_freq=10000)
    word2idx, idx2word, embeddings = EmbeddingsLoader(
        new_emb_file, emb_dim, extra_tokens=HRED_SPECIAL_TOKENS).load()
else:
    word2idx, idx2word = word2idx_from_dataset(
        vocab_dict, most_freq=10000, extra_tokens=HRED_SPECIAL_TOKENS)
    embeddings = None
    emb_dim = options.emb_dim

vocab_size = len(word2idx)
print("Vocabulary size: {}".format(vocab_size))

# --- set dataset transforms ---
tokenizer = DialogSpacyTokenizer(lower=True,
                                 prepend_sos=True,
                                 append_eos=True,
                                 specials=HRED_SPECIAL_TOKENS)
to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
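# Sketch (assumption, not part of the original script): how the loaded
# `embeddings` matrix, or the random-init branch above, might be turned into
# an embedding layer. The helper name and the `trainable` flag are hypothetical.
import torch
import torch.nn as nn


def make_embedding_layer(embeddings, vocab_size, emb_dim, trainable=False):
    if embeddings is not None:
        # Pretrained weights: wrap the loaded matrix, optionally frozen.
        weights = torch.tensor(embeddings, dtype=torch.float)
        return nn.Embedding.from_pretrained(weights, freeze=not trainable)
    # No pretrained file given: fall back to a randomly initialized layer.
    return nn.Embedding(vocab_size, emb_dim)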
    trainer = Seq2SeqTrainer(model,
                             optimizer,
                             checkpoint_dir=None,  # '../checkpoints',
                             metrics=metrics,
                             non_blocking=True,
                             retain_graph=True,
                             patience=5,
                             device=device,
                             loss_fn=criterion)
    return trainer


import os

if __name__ == '__main__':
    loader = EmbeddingsLoader('../cache/glove.6B.50d.txt', 50)
    word2idx, _, embeddings = loader.load()

    tokenizer = SpacyTokenizer()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')
    transforms = Compose([tokenizer, to_token_ids, to_tensor])

    dataset = MovieCorpusDataset('../data/', transforms=transforms, train=True)
    # dataset = dataset.map(tokenizer).map(to_token_ids).map(to_tensor)

    if KFOLD:
        cv_scores = []
        import gc

        for train_loader, val_loader in kfold_split(dataset, 32, 128):
            trainer = trainer_factory(embeddings, device=DEVICE)
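# Sketch (assumption, not the library implementation): a kfold_split helper
# along the lines of the one used above, built from sklearn's KFold and torch
# samplers. The two integer arguments are assumed to be the train and
# validation batch sizes.
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, SubsetRandomSampler


def kfold_split_sketch(dataset, batch_train, batch_val, n_splits=5,
                       collate_fn=None):
    kfold = KFold(n_splits=n_splits, shuffle=True)
    for train_idx, val_idx in kfold.split(np.arange(len(dataset))):
        # One train/validation loader pair per fold, sampling disjoint indices.
        train_loader = DataLoader(dataset, batch_size=batch_train,
                                  sampler=SubsetRandomSampler(train_idx),
                                  collate_fn=collate_fn)
        val_loader = DataLoader(dataset, batch_size=batch_val,
                                sampler=SubsetRandomSampler(val_idx),
                                collate_fn=collate_fn)
        yield train_loader, val_loader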
def load_embeddings(emb_file, emb_dim):
    loader = EmbeddingsLoader(emb_file, emb_dim,
                              extra_tokens=HRED_SPECIAL_TOKENS)
    word2idx, idx2word, embeddings = loader.load()
    return word2idx, idx2word, embeddings
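# Usage sketch (assumption, not part of the original file): the GloVe path and
# dimension below are illustrative values, and the helper's dependencies
# (EmbeddingsLoader, HRED_SPECIAL_TOKENS) are assumed to be in scope as above.
if __name__ == '__main__':
    word2idx, idx2word, embeddings = load_embeddings(
        './cache/glove.6B.300d.txt', 300)
    print("Loaded {} words with embedding dim {}".format(
        len(word2idx), len(embeddings[0])))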
if __name__ == '__main__':
    ####### Parameters ########
    batch_train = 8
    batch_val = 8
    max_sent_length = 500  # max number of sentences (turns) in transcript - after padding
    max_word_length = 122  # max length of each sentence (turn) - after padding
    num_classes = 2
    batch_size = 8
    hidden_size = 300
    epochs = 40
    lexicons = False
    lex_size = 99

    loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300)
    word2idx, idx2word, embeddings = loader.load()
    embeddings = torch.tensor(embeddings)

    with open("avec.pkl", "rb") as handle:
        _file = pickle.load(handle)

    tokenizer = SpacyTokenizer()
    replace_unknowns = ReplaceUnknownToken()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device=DEVICE)

    train = AVECDataset(_file, max_word_length,
                        transforms=Compose([
                            tokenizer,
                            replace_unknowns,
                            to_token_ids,
    def __getitem__(self, idx):
        datum = self.dataset[idx]
        text, target = datum['text'], datum['sentiment']
        target = self.label_encoder.transform([target])[0]
        for t in self.transforms:
            text = t(text)
        return text, target


DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
collate_fn = SequenceClassificationCollator(device='cpu')


if __name__ == '__main__':
    loader = EmbeddingsLoader('../cache/glove.840B.300d.txt', 300)
    word2idx, _, embeddings = loader.load()

    tokenizer = SpacyTokenizer()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')

    def create_dataloader(d):
        d = (DatasetWrapper(d)
             .map(tokenizer)
             .map(to_token_ids)
             .map(to_tensor))
        return DataLoader(d,
                          batch_size=32,
                          num_workers=1,
                          pin_memory=True,
                          shuffle=True,
                          collate_fn=collate_fn)
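# Sketch (assumption, not the library implementation): a padding collate
# function along the lines of SequenceClassificationCollator, shown only to
# illustrate what the DataLoader above expects from each (tokens, label) pair.
import torch
from torch.nn.utils.rnn import pad_sequence


def pad_collate(batch, pad_value=0):
    texts, targets = zip(*batch)
    # Record the original lengths, then pad the token-id tensors to a
    # common length and stack the integer labels.
    lengths = torch.tensor([len(t) for t in texts])
    padded = pad_sequence(list(texts), batch_first=True,
                          padding_value=pad_value)
    return padded, torch.tensor(targets), lengths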
dataset = MovieCorpusDatasetv2('./data/', transforms=None)

# Preprocess dataset
MIN_COUNT = 3
MAX_LENGTH = 10
dataset.normalize_data()
dataset.threshold_data(MAX_LENGTH, tokenizer=SpacyTokenizer())
dataset.trim_words(MIN_COUNT, tokenizer=SpacyTokenizer())

# Load embeddings
emb_file = './cache/glove.6B.300d.txt'
new_emb_file = './cache/new_embs.txt'
create_emb_file(new_emb_file, emb_file, dataset.word2count)
loader = EmbeddingsLoader(new_emb_file, 300, extra_tokens=SPECIAL_TOKENS)
word2idx, idx2word, embeddings = loader.load()

# Get the sos, eos and pad token indices
pad_index = word2idx[SPECIAL_TOKENS.PAD.value]
bos_index = word2idx[SPECIAL_TOKENS.BOS.value]
eos_index = word2idx[SPECIAL_TOKENS.EOS.value]

# Apply transforms to dataset
tokenizer = SpacyTokenizer(append_eos=True, specials=SPECIAL_TOKENS)
to_token_ids = ToTokenIds(word2idx)
to_tensor = ToTensor(device='cpu')
transforms = Compose([tokenizer, to_token_ids, to_tensor])
dataset.apply_transforms(transforms)
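# Sketch (assumption, not part of the original script): one way the transformed
# dataset and pad_index could feed a DataLoader, assuming each item is a
# (source, target) pair of 1D token-id tensors.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


def seq2seq_collate(batch):
    sources, targets = zip(*batch)
    # Pad source and target sequences separately with the PAD token index.
    sources = pad_sequence(list(sources), batch_first=True,
                           padding_value=pad_index)
    targets = pad_sequence(list(targets), batch_first=True,
                           padding_value=pad_index)
    return sources, targets


train_loader = DataLoader(dataset, batch_size=32, shuffle=True,
                          collate_fn=seq2seq_collate)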