def evaluate_input(searcher, word2idx, idx2word, device):
    tokenizer = DialogSpacyTokenizer(lower=True, specials=HRED_SPECIAL_TOKENS)
    to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
    to_tensor = ToTensor()
    transforms = [tokenizer, to_token_ids, to_tensor]
    previous = None
    while True:
        try:
            # Get input sentence
            input_sentence1 = input('> ')
            if input_sentence1 == 'q' or input_sentence1 == 'quit':
                break
            # Normalize sentence
            input_sentence1 = normalizeString(input_sentence1)
            # Apply transforms and evaluate sentence
            for t in transforms:
                input_sentence1 = t(input_sentence1)
            output_words = evaluate(searcher, idx2word, previous,
                                    input_sentence1, device)
            previous = input_sentence1
            print(output_words)
            output_words[:] = [
                x for x in output_words if not (x == 'EOS' or x == 'PAD')
            ]
            print('Bot:', ' '.join(output_words))
        except KeyError:
            print("Error: Encountered unknown word.")
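# `normalizeString` is called above but not defined in this snippet. A minimal
# sketch, assuming it mirrors the usual lowercase / strip-accents / punctuation
# normalization from the PyTorch chatbot tutorial (the repository's own version
# may differ):
import re
import unicodedata


def normalizeString(s):
    # Lowercase, strip accents, pad punctuation with spaces, drop other symbols.
    s = "".join(
        c for c in unicodedata.normalize("NFD", s.lower().strip())
        if unicodedata.category(c) != "Mn"
    )
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()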
def setup(self, stage=None):
    if self.setup_has_run:
        return
    super(PLDataModuleFromCorpus, self).setup(stage=stage)
    train_corpus, train_labels = zip(*self.train)  # type: ignore
    val_corpus, val_labels = zip(*self.val)  # type: ignore
    if not self.no_test_set:
        test_corpus, test_labels = zip(*self.test)  # type: ignore
    self.train_corpus, self.val_corpus, self.test_corpus = self._create_corpora(
        train_corpus, val_corpus, test_corpus, self.corpus_args)
    to_tensor = ToTensor(device="cpu")
    if self.language_model:
        self.train = CorpusLMDataset(self.train_corpus).map(to_tensor)
        self.val = CorpusLMDataset(self.val_corpus).map(to_tensor)
        if not self.no_test_set:
            self.test = CorpusLMDataset(self.test_corpus).map(to_tensor)
    else:
        self.train = CorpusDataset(self.train_corpus, train_labels).map(to_tensor)
        self.val = CorpusDataset(self.val_corpus, val_labels).map(to_tensor)
        if not self.no_test_set:
            self.test = CorpusDataset(self.test_corpus, test_labels).map(to_tensor)
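# Illustrative only: setup() above assumes self.train / self.val / self.test are
# lists of (sample, label) pairs, which zip(*pairs) splits into parallel tuples.
# The example data below is hypothetical.
pairs = [("a great movie", 1), ("terrible acting", 0)]
corpus, labels = zip(*pairs)
# corpus == ("a great movie", "terrible acting"), labels == (1, 0)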
def __init__(
    self,
    data: List[Dict[str, Any]],
    modalities: Union[List[str], Set[str]] = {"text", "audio", "visual"},
    text_is_tokens: bool = False,
    label_selector: Optional[Callable] = None,
):
    super(MOSEI, self).__init__(data, modalities)

    def default_label_selector(l):
        return l[0][0]

    if label_selector is None:
        label_selector = default_label_selector
    self.map(label_selector, "label", lazy=True)
    for m in self.modalities:
        if m == "text" and text_is_tokens:
            self.map(ToTensor(dtype=torch.long), m, lazy=True)
        else:
            self.map(ToTensor(dtype=torch.float), m, lazy=True)
def __init__(
    self,
    data: List[Dict[str, Any]],
    modalities: Union[List[str], Set[str]] = {"text", "audio", "visual"},
    text_is_tokens: bool = False,
    binary: bool = False,
):
    super(MOSI, self).__init__(data, modalities)

    def label_selector(l):
        return l.item()

    self.map(label_selector, "label", lazy=True)
    if binary:
        self.map(binarize, "label", lazy=True)
    for m in self.modalities:
        if m == "text" and text_is_tokens:
            self.map(ToTensor(dtype=torch.long), m, lazy=True)
        else:
            self.map(ToTensor(dtype=torch.float), m, lazy=True)
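# `binarize` is imported elsewhere in the repository. A minimal sketch of such a
# transform, assuming the common MOSI convention that non-negative sentiment
# scores map to the positive class (the actual helper may use a different
# threshold or return type):
def binarize_example(label: float) -> int:
    # Hypothetical helper for illustration only.
    return 1 if label >= 0 else 0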
else:
    word2idx, idx2word = word2idx_from_dataset(
        vocab_dict, most_freq=10000, extra_tokens=HRED_SPECIAL_TOKENS)
    embeddings = None
    emb_dim = options.emb_dim

vocab_size = len(word2idx)
print("Vocabulary size: {}".format(vocab_size))

# --- set dataset transforms ---
tokenizer = DialogSpacyTokenizer(lower=True,
                                 prepend_sos=True,
                                 append_eos=True,
                                 specials=HRED_SPECIAL_TOKENS)
to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
to_tensor = ToTensor()
dataset = dataset.map(tokenizer).map(to_token_ids).map(to_tensor)
print("Dataset size: {}".format(len(dataset)))

# --- make train and val loaders ---
collator_fn = HRED_Collator(device='cpu')
train_loader, val_loader = train_test_split(dataset,
                                            batch_train=BATCH_TRAIN_SIZE,
                                            batch_val=BATCH_VAL_SIZE,
                                            collator_fn=collator_fn,
                                            test_size=0.2)
pad_index = word2idx[HRED_SPECIAL_TOKENS.PAD.value]
sos_index = word2idx[HRED_SPECIAL_TOKENS.SOS.value]
    directory='data/',
    train=True,
    dev=True,
    test=True,
    extracted_name='wikitext-2',
    url='https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',  # noqa: E501
    unknown_token=SPECIAL_TOKENS.UNK.value,
    eos_token=SPECIAL_TOKENS.EOS.value)

vocab = create_vocab(train + dev,
                     vocab_size=vocab_size,
                     extra_tokens=SPECIAL_TOKENS.to_list())
replace_unk = ReplaceUnknownToken()
to_token_ids = ToTokenIds(vocab)
to_tensor = ToTensor(device='cpu')


def create_dataloader(base):
    wrapped = (LMDataset(base, max_len=max_len)
               .map(replace_unk)
               .map(to_token_ids)
               .map(to_tensor)
               .apply_transforms())
    return DataLoader(wrapped,
                      batch_size=128,
                      num_workers=1,
                      pin_memory=True,
                      collate_fn=collate_fn)


train_loader = create_dataloader(train[:1000])
dev_loader = create_dataloader(dev[:1000])
test_loader = create_dataloader(test[:1000])

device = 'cuda' if torch.cuda.is_available() else 'cpu'
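# Illustrative only: pulling one batch from the loader built above. The batch
# structure depends on `collate_fn`; padded input/target tensors are typical
# for a language-modeling setup like this one.
batch = next(iter(train_loader))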
hidden_size = 300
epochs = 40
lexicons = False
lex_size = 99

loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300)
word2idx, idx2word, embeddings = loader.load()
embeddings = torch.tensor(embeddings)

with open("avec.pkl", "rb") as handle:
    _file = pickle.load(handle)

tokenizer = SpacyTokenizer()
replace_unknowns = ReplaceUnknownToken()
to_token_ids = ToTokenIds(word2idx)
to_tensor = ToTensor(device=DEVICE)

train = AVECDataset(_file,
                    max_word_length,
                    transforms=Compose([
                        tokenizer, replace_unknowns, to_token_ids, to_tensor
                    ]),
                    split='train')
dev = AVECDataset(_file,
                  max_word_length,
                  transforms=Compose([
                      tokenizer, replace_unknowns, to_token_ids, to_tensor
                  ]),
                  split='dev')
test = AVECDataset(_file,
from slp.modules.transformer import Transformer
from slp.util.pytorch import pad_mask, subsequent_mask

simplefilter(action="ignore")
pl.utilities.seed.seed_everything(42)

collate_fn = Seq2SeqCollator(device="cpu")

# All tokens are different. Should get 100% accuracy
sentence = "The big brown fox jumps over the lazy dog".split(" ")

vocab = create_vocab([sentence], vocab_size=-1, special_tokens=SPECIAL_TOKENS)
vocab = dict(zip(vocab.keys(), itertools.count()))
to_token_ids = ToTokenIds(vocab)
to_tensor = ToTensor(device="cpu")


class DummyDataset(Dataset):
    def __init__(self):
        self.data = [(sentence[0:-1], sentence[1:])]

    def __len__(self):
        return 1

    def __getitem__(self, i):
        s, t = self.data[i]
        return to_tensor(to_token_ids(s)), to_tensor(to_token_ids(t))
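# Illustrative only: the dummy dataset above would typically be wrapped in a
# DataLoader with the Seq2SeqCollator defined earlier, assuming the collator
# pads and batches (source, target) pairs.
from torch.utils.data import DataLoader

loader = DataLoader(DummyDataset(), batch_size=1, collate_fn=collate_fn)
batch = next(iter(loader))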