def reverse(self, batch, limited=False):
    """Convert a batch of token indices back into detokenized strings.

    Args:
        batch: integer tensor of token indices, shaped (batch, time) when
            ``self.batch_first`` is true, else (time, batch).
        limited: unused in this implementation; kept for interface
            compatibility.  # NOTE(review): confirm callers rely on it

    Returns:
        list[str]: one string per example, trimmed at the first eos token
        with init/pad tokens removed.
    """
    if self.use_revtok:
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise
    if not self.batch_first:
        batch = batch.t()
    with torch.cuda.device_of(batch):
        batch = batch.tolist()
    batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize

    def trim(s, t):
        # Keep tokens up to (and excluding) the first occurrence of t.
        sentence = []
        for w in s:
            if w == t:
                break
            sentence.append(w)
        return sentence

    batch = [trim(ex, self.eos_token) for ex in batch]  # trim past first eos

    def filter_special(tok):
        return tok not in (self.init_token, self.pad_token)

    # FIX: materialize the filter — in Python 3 `filter` returns a one-shot
    # iterator; use list(filter(...)) for consistency with the loader's
    # denumericalize helper elsewhere in this file.
    batch = [list(filter(filter_special, ex)) for ex in batch]
    if self.use_revtok:
        return [revtok.detokenize(ex) for ex in batch]
    # NOTE(review): joins with no separator — presumably a character-level
    # vocabulary; the sibling reverse() uses ' '.join — confirm intended.
    return [''.join(ex) for ex in batch]
def reverse(self, batch):
    """Convert a batch of token indices back into space-joined strings.

    Args:
        batch: integer tensor of token indices, shaped (batch, time) when
            ``self.batch_first`` is true, else (time, batch).

    Returns:
        list[str]: one string per example, trimmed at the first eos token
        with init/pad tokens removed.
    """
    if self.use_revtok:
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise
    if not self.batch_first:
        batch = batch.t()
    with torch.cuda.device_of(batch):
        batch = batch.tolist()
    batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize

    def trim(s, t):
        # Keep tokens up to (and excluding) the first occurrence of t.
        sentence = []
        for w in s:
            if w == t:
                break
            sentence.append(w)
        return sentence

    batch = [trim(ex, self.eos_token) for ex in batch]  # trim past first eos

    def filter_special(tok):
        return tok not in (self.init_token, self.pad_token)

    # FIX: materialize the filter — in Python 3 `filter` returns a one-shot
    # iterator; use list(filter(...)) for consistency with the loader's
    # denumericalize helper elsewhere in this file.
    batch = [list(filter(filter_special, ex)) for ex in batch]
    if self.use_revtok:
        return [revtok.detokenize(ex) for ex in batch]
    return [' '.join(ex) for ex in batch]
def load_real_dataset(dataset_name):
    """Load the train/valid/test splits and the pickled vocab field.

    Attaches `detokenize` and `denumericalize` helpers to the loaded field,
    sets `fix_length`, prints corpus statistics, and returns
    (train, valid, test, field).
    """
    import random
    random.seed(42)

    split_files = ["{}_{}.txt".format(dataset_name, split)
                   for split in ("train", "valid", "test")]
    train_file, valid_file, test_file = split_files
    print(train_file, valid_file, test_file)

    field = load(file_name=dataset_name + "_vocab.pkl",
                 parent_path=DATASET_PATH)
    train_ds, valid_ds, test_ds = (
        LanguageModelingDataset(path=DATASET_PATH + fname,
                                newline_eos=False,
                                text_field=field)
        for fname in split_files)
    import revtok

    def denumericalize(batch):
        # indices -> tokens, cut at first eos, drop init/pad specials
        def cut_at_eos(tokens):
            kept = []
            for tok in tokens:
                if tok == field.eos_token:
                    break
                kept.append(tok)
            return kept

        rows = [[field.vocab.itos[idx] for idx in row]
                for row in batch.tolist()]
        rows = [cut_at_eos(row) for row in rows]
        return [[tok for tok in row
                 if tok not in (field.init_token, field.pad_token)]
                for row in rows]

    field.detokenize = lambda B: [revtok.detokenize(row) for row in B]
    field.denumericalize = denumericalize
    field.fix_length = field.max_length + 1

    lens = [len(example) for example in train_ds.text]
    print(
        'vocab size: {}\ntrain size: {}\n valid size: {}\n test size: {}\n '
        'min length: {}\n max length: {}\n mean train length: {:.2f}, loaded max length: {}'
        .format(len(field.vocab), len(train_ds), len(valid_ds), len(test_ds),
                np.min(lens), np.max(lens), np.mean(lens), field.max_length))
    return train_ds, valid_ds, test_ds, field
def detokenize(tokens):
    """Rebuild the original string from aligned token dicts.

    Keeps one `orig` per `orig_id`, skipping tokens with no origin and
    consecutive tokens that share the previous token's `orig_id`, then
    hands the surviving words to revtok for joining.
    """
    words = [tok['orig'] for pos, tok in enumerate(tokens)
             if tok['orig_id'] is not None
             and not (pos and tok['orig_id'] == tokens[pos - 1]['orig_id'])]
    return revtok.detokenize(words)
def reverse(self, batch, src_data=None, att=None, dic_src=None):
    """Convert token indices to strings, optionally replacing '<unk>'
    with the most-attended source token.

    Args:
        batch: integer tensor of token indices, shaped (batch, time) when
            ``self.batch_first`` is true, else (time, batch).
        src_data: source-side indices used for '<unk>' replacement.
        att: attention weights indexed by target position.
        dic_src: source-side field providing ``vocab.itos``.

    Returns:
        list[str]: one string per example; each kept token is followed by
        a single space (trailing space included).
    """
    if self.use_revtok:
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise
    if not self.batch_first:
        batch = batch.t()
    with torch.cuda.device_of(batch):
        batch = batch.tolist()
    batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize
    if att is not None:
        # Replace each generated '<unk>' with the source token that received
        # the highest attention weight at that target position.
        # NOTE(review): att is indexed by j only (ignores i) — looks like it
        # assumes batch size 1; also `maxIndex[0]` fails on 0-dim results in
        # modern PyTorch — confirm the expected torch version / att shape.
        for i in range(len(batch)):
            for j in range(len(batch[i])):
                if batch[i][j] == '<unk>':
                    _, maxIndex = att[j].max(0)
                    batch[i][j] = dic_src.vocab.itos[src_data[maxIndex[0]]]

    def trim(s, t):
        # Keep tokens up to (and excluding) the first occurrence of t.
        sentence = []
        for w in s:
            if w == t:
                break
            sentence.append(w)
        return sentence

    batch = [trim(ex, self.eos_token) for ex in batch]  # trim past first eos

    def filter_special(tok):
        return tok not in (self.init_token, self.pad_token)

    # BUG FIX: the original appended ' ' to every token inside trim(), so
    # filter_special never matched ('<s> ' != '<s>') and init/pad tokens
    # leaked into the output. Filter the raw tokens first, then add the
    # trailing-space separator only when joining.
    batch = [[tok for tok in ex if filter_special(tok)] for ex in batch]
    if self.use_revtok:
        return [revtok.detokenize(ex) for ex in batch]
    return [''.join(tok + ' ' for tok in ex) for ex in batch]
def detokenize(self, tokenized, field_name=None):
    """Join a sequence of revtok tokens back into a plain string.

    Args:
        tokenized: sequence of token strings as produced by revtok.
        field_name: unused here; presumably part of a shared detokenizer
            interface — TODO confirm against callers.

    Returns:
        The detokenized string from ``revtok.detokenize``.
    """
    return revtok.detokenize(tokenized)
self.inverse_doc_freqs = idf def _term_freqs(self, doc): counter = Counter(doc) for token in doc: counter[token] /= len(doc) return counter if __name__ == '__main__': # Interactive testing for relevant memories retrieval import revtok from dataset import Dataset dataset = Dataset() kv_memory = KeyValueMemory(dataset) print('Interactive memory retrieval. {} to cancel\n'.format( colorize('Press CTRL + C', color='white'))) try: while True: query = revtok.tokenize(input('> ').strip()) queries, responses, _ = kv_memory.address(query) for key, value in zip(queries, responses): print('\nQ: {query}'.format(query=revtok.detokenize(key))) print( 'R: {response}'.format(response=revtok.detokenize(value))) print() except (KeyboardInterrupt, EOFError): print('\n\nShutting down')