def evaluate_input(searcher, word2idx, idx2word, device):
    """Interactive chat loop against a trained searcher.

    Reads sentences from stdin until 'q'/'quit', normalizes and transforms
    each one into token-id tensors, decodes a reply with ``evaluate`` (the
    previous user sentence is passed along as dialogue context), and prints
    the reply with EOS/PAD tokens stripped. Unknown words raise KeyError in
    the token-id lookup and are reported without killing the loop.
    """
    pipeline = [
        DialogSpacyTokenizer(lower=True, specials=HRED_SPECIAL_TOKENS),
        ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS),
        ToTensor(),
    ]
    previous = None  # last user sentence, fed back as context
    while True:
        try:
            sentence = input('> ')
            if sentence in ('q', 'quit'):
                break
            sentence = normalizeString(sentence)
            for transform in pipeline:
                sentence = transform(sentence)
            reply = evaluate(searcher, idx2word, previous, sentence, device)
            previous = sentence
            print(reply)  # raw decoder output, including special tokens
            reply[:] = [tok for tok in reply if tok not in ('EOS', 'PAD')]
            print('Bot:', ' '.join(reply))
        except KeyError:
            print("Error: Encountered unknown word.")
# NOTE(review): truncated fragment of the fine-tuning script's main section —
# it opens mid parser.add_argument(...) and ends on a body-less
# "if options.preprocess:". Visible behavior: defines -shared / -shared_emb
# flags, parses args, refuses to run with -pt (pretraining) enabled, builds a
# lowercasing tokenizer with HRED special tokens, selects the dataset from
# options.dataset (movie / dailydialog / semaine, else hard failure via
# assert), then normalizes the dataset. Kept byte-identical; a safe rewrite
# needs the missing surrounding lines.
action='store_true', default=False, help='shared weights between encoder ' 'and decoder') parser.add_argument('-shared_emb', action='store_true', default=False, help='shared embedding layer') options = parser.parse_args() if options.pretraining is True: assert False, "you are using this script to fine tune the whole " \ "model! -pt should not be activated!" # --- read data to create vocabulary dict --- tokenizer = DialogSpacyTokenizer(lower=True, specials=HRED_SPECIAL_TOKENS) if options.dataset == "movie": dataset = MovieCorpusDatasetTriples('./data/', transforms=None) elif options.dataset == "dailydialog": dataset = DailyDialogDataset('./data/ijcnlp_dailydialog', transforms=None) elif options.dataset == "semaine": dataset = SemaineDatasetTriplesOnly( "./data/semaine-database_download_2020-01-21_11_41_49") else: assert False, "Specify dataset used in options (movie, dailydialog or" \ "semaine)" dataset.normalize_data() if options.preprocess:
def map(self, t): if self.transforms is None: self.transforms = [] self.transforms.append(t) return self def __len__(self): return len(self.triples) def __getitem__(self, idx): s1, s2, s3 = self.triples[idx] if self.transforms is not None: for t in self.transforms: s1 = t(s1) s2 = t(s2) s3 = t(s3) return s1, s2, s3 if __name__ == '__main__': dataset = DailyDialogDataset('./data/ijcnlp_dailydialog', transforms=None) tokenizer = DialogSpacyTokenizer(lower=True, specials=HRED_SPECIAL_TOKENS) #dataset.normalize_data() #dataset.threshold_data(12, tokenizer=tokenizer) #dataset.trim_words(3, tokenizer=tokenizer) vocab_dict = dataset.create_vocab_dict(tokenizer) import ipdb ipdb.set_trace()