def main():
    args = parse_args()
    if args.dynet_seed:
        random.seed(args.dynet_seed)
        np.random.seed(args.dynet_seed)

    src_vocab = Vocabulary('<unk>', eos_symbol='</s>')
    tgt_vocab = Vocabulary('<unk>', sos_symbol='<s>', eos_symbol='</s>')
    train = list(read_bitext(src_vocab, tgt_vocab, args.train_src, args.train_tgt))
    src_vocab.freeze()
    tgt_vocab.freeze()
    dev = list(read_bitext(src_vocab, tgt_vocab, args.dev_src, args.dev_tgt))

    # init model
    model = Seq2SeqAtt(src_vocab, tgt_vocab, args.src_embed_dim, args.tgt_embed_dim,
                       args.enc_nlayers, args.enc_hidden_dim, args.dec_nlayers,
                       args.dec_hidden_dim, args.attention_dim, args.label_smoothing)
    if args.saved_model:
        model.load_model(args.saved_model)

    if args.only_decode:
        print("Reading test data...")
        test = list(read_bitext(src_vocab, tgt_vocab, args.test_src, args.test_tgt))
        model.translate(test, args.beam_size, args.max_output_len, args.length_norm,
                        args.output_file, args.relative, args.absolute, args.local,
                        args.candidate)
        print("Done")
    else:
        training_procedure = BasicTrainingProcedure(model, dy.SimpleSGDTrainer(model.pc))
        training_procedure.train(args.epochs, train, dev, args.batch_size,
                                 args.batch_size, args.max_output_len)
def load_word_data(questions_df, image_captions, exclude_word_list):
    vocab = Vocabulary()
    answers = Vocabulary(first_word="RELEVANT")
    specific_answers = Vocabulary()
    question_seq_length = 1
    caption_seq_length = 1

    print("Generating vocabulary and answer indices...")
    new_questions = []
    for _, row in questions_df.iterrows():
        question_words = row['question'].split(' ')
        if len(question_words) > question_seq_length:
            question_seq_length = len(question_words)
        all_words = question_words

        image_file = row['image_file']
        if image_file in image_captions:
            caption = image_captions[image_file]
            caption_words = caption.split(' ')
            if len(caption_words) > caption_seq_length:
                caption_seq_length = len(caption_words)
            all_words += caption_words

        for word in all_words:
            if len(word) > 0 and word not in exclude_word_list:
                vocab.add_word(word)

        # if row['relevant'] == 0:
        answers.add_word(row['answer'])
        specific_answers.add_word(row['specific_answer'])

    print('\tVocab count: [%d]' % len(vocab))
    print('\tAnswers count: [%d]' % len(answers))
    print('\tQuestion sequence length: [%d]' % question_seq_length)
    print('\tCaption sequence length: [%d]' % caption_seq_length)

    # word_vectors_file and embedding_dim are module-level settings
    print("Loading word vectors...")
    word_to_vector = load_word_vectors(word_vectors_file, vocab)

    print('Creating embedding matrix...')
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    words_not_found = []
    for word, i in vocab.word_index.items():
        if word not in word_to_vector:
            words_not_found.append(word)
            continue
        embedding_vector = word_to_vector[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    if len(words_not_found) > 0:
        print("Words not found:", "\n\t", words_not_found)
        for word in words_not_found:
            del vocab.index_word[vocab.word_index[word]]

    return (vocab, answers, specific_answers, embedding_matrix, word_to_vector,
            question_seq_length, caption_seq_length)
def __init__(self, data_dir, mode, vocab_size):
    self.df = pd.read_csv(os.path.join(data_dir, mode + '.csv'))
    self.sentences = self.df['text'].values
    self.labels = self.df['label'].values

    # Initialize dataset Vocabulary objects and build our vocabulary
    self.sentences_vocab = Vocabulary(vocab_size)
    self.labels_vocab = Vocabulary(vocab_size)
    self.sentences_vocab.build_vocabulary(self.sentences)
    self.labels_vocab.build_vocabulary(self.labels, add_unk=False)
def toShakespeare(self):
    """Given a line of text, return that text in the indicated style.

    Args:
        modern_text: (string) The input.

    Returns:
        string: The translated text, if generated.
    """
    args = load_arguments()
    vocab = Vocabulary(self.vocab_path, args.embedding, args.dim_emb)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        model = Model(args, vocab)
        model.saver.restore(sess, args.model)

        if args.beam > 1:
            decoder = beam_search.Decoder(sess, args, vocab, model)
        else:
            decoder = greedy_decoding.Decoder(sess, args, vocab, model)

        batch = get_batch([self.modern_text], [1], vocab.word2id)
        ori, tsf = decoder.rewrite(batch)
        out = ' '.join(w for w in tsf[0])

    return out
def main():
    args = _parse_args()
    assert not os.path.exists(args.model), f"specified file already exists: {args.model}"

    with io.open(args.corpus, mode="r") as corpus:
        v = Vocabulary(table_size=int(2E7))
        v.create(corpus, [(args.n_vocab, args.n_min_freq, args.n_min_freq)])

    print(f"finished. saving models: {args.model}")
    v.save(args.model)

    # sanity check
    print("done. now execute sanity check...")
    print(f"n_vocab: {len(v)}, total_freq: {sum(v.counts)}")
    s = "Knox County Health Department is following national Centers for Disease Control and Prevention Protocol to contain infection."
    print(f"sentence: {s}")
    s_tokenized = "/".join(v.tokenize(s, remove_oov=False))
    print(f"tokenized: {s_tokenized}")

    print("random sampling...")
    n_sample = 100
    x = v.random_ids(n_sample)
    w, f = np.unique(list(map(v.id2word, x)), return_counts=True)
    for idx in np.argsort(f)[::-1]:
        print(f"{w[idx]} -> {f[idx]}")
    print("finished. good-bye.")
def train():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)

    train_data_path = train_config['train_data_path']
    test_data_path = train_config['test_data_path']
    vocab_path = train_config['vocab_path']

    train_input_data, train_input_label = load_corpus(file_path=train_data_path,
                                                      make_vocab=True,
                                                      vocab_path=vocab_path)
    val_input_data, val_input_label = load_corpus(file_path=test_data_path,
                                                  make_vocab=False)

    vocab = Vocabulary(vocab_path)
    model = Spacing(vocab_len=len(vocab))
    print(model)

    trainer = Trainer(model=model,
                      vocab=vocab,
                      train_data=train_input_data,
                      train_label=train_input_label,
                      val_data=val_input_data,
                      val_label=val_input_label,
                      config=train_config)
    trainer.train(total_epoch=10, validation_epoch=1)
def ngrams(prefix):
    """ Find n-grams and make a vocabulary from the parsed corpus """
    with BZ2File(prefix + 'corpus.bz2', 'r') as corpus:
        vocab = Vocabulary(build_table=False)
        vocab.create(corpus, [(75000, 350), (25000, 350), (10000, 350)])
        vocab.save(prefix + 'vocab.gz')
def read_vocabs(self, datafile, corpus_name):
    lines = open(datafile, encoding="utf-8").read().strip().split('\n')
    pairs = [[self.normalize_string(s) for s in line.split('\t')] for line in lines]
    vocab = Vocabulary(corpus_name)
    return vocab, pairs
def prepare_data():
    lines = open(config.TXT_DATA).read().strip().split('\n')
    pairs = [[snippet for snippet in line.split("$")] for line in lines]

    source_vocab = Vocabulary(config.SOURCE)
    target_vocab = Vocabulary(config.TARGET)
    for pair in pairs:
        source_vocab.add_sentence(pair[0])
        target_vocab.add_sentence(pair[1])

    random.shuffle(pairs)
    eval_pairs = pairs[:int(len(pairs) * config.EVAL_PERCENTAGE)]
    train_pairs = pairs[int(len(pairs) * config.EVAL_PERCENTAGE):]
    return source_vocab, target_vocab, train_pairs, eval_pairs
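# Hypothetical usage sketch for prepare_data() above; config.TXT_DATA,
# config.SOURCE, config.TARGET and config.EVAL_PERCENTAGE are assumed to be
# defined elsewhere, exactly as the function itself assumes.
source_vocab, target_vocab, train_pairs, eval_pairs = prepare_data()
print(len(train_pairs), 'training pairs /', len(eval_pairs), 'evaluation pairs')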
def __init__(self, data_path, vocab=Vocabulary(), predict=False):
    """ Creates an object that gets data from a file. """
    super(Data, self).__init__(data_path, vocab)
    if not predict:
        self._train_test_split()
def main():
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformer
    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
        val_loader = DataLoader(val_dset,
                                batch_size=SETTING.batch_size,
                                shuffle=False,
                                num_workers=SETTING.n_cpu,
                                collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size, SETTING.rnn_type)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {}".format(sec2str(time.time() - begin)), flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)

    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    all_embeddings = np.concatenate([image, caption], axis=0)

    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(SETTING.method))
    vis_file = os.path.join(savedir, "{}.png".format(SETTING.method))
    np.save(emb_file, all_embeddings)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=SETTING.method)
    plot_embeddings(save_file, n_i, vis_file, method=SETTING.method)
def main():
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
        val_loader = DataLoader(val_dset,
                                batch_size=SETTING.batch_size,
                                shuffle=False,
                                num_workers=SETTING.n_cpu,
                                collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size, SETTING.rnn_type)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert SETTING.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(SETTING.checkpoint), flush=True)
    # map tensors to the active device so CPU-only hosts can load GPU-saved checkpoints
    ckpt = torch.load(SETTING.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {}".format(sec2str(time.time() - begin)), flush=True)

    retrieve_i2c(dset, val_dset, args.image_path, imenc, transform)
    retrieve_c2i(dset, val_dset, args.output_dir, args.caption, capenc, vocab)
def __init__(self, data_path, vocab=Vocabulary()):
    # note: the default Vocabulary() is created once, when the function is
    # defined, and is shared by every call that relies on the default
    self.vocab = vocab
    data = get_requests_from_file(data_path)
    print("Downloaded {} samples".format(len(data)))

    # materialize the map() result: a bare map iterator would be exhausted
    # after building self.data, leaving self.lengths empty
    map_result = list(map(self._process_request, data))
    self.data = [x[0] for x in map_result]
    self.lengths = [x[1] for x in map_result]
    assert len(self.data) == len(self.lengths)
def load_or_create_vocab(trainDataset=None, testDataset=None):
    Texts = list(trainDataset.anns.values()) + list(testDataset.anns.values())
    if os.path.exists(VOCAB_FILE):
        print("loading vocab")
        vocab = torch.load(VOCAB_FILE)
        print("vocab loaded")
        return vocab
    else:
        vocab = Vocabulary()
        vocab.create_from_texts(Texts)
        return vocab
def __init__(self, csv_path, image_path, transform=None, batch_size=4):
    self.captionsfile = pd.read_csv(csv_path)
    self.image_path = image_path
    self.transform = transform
    self.vocab = Vocabulary(vocab_threshold=2)
    self.batch_size = batch_size
    all_tokens = [
        nltk.tokenize.word_tokenize(str(self.captionsfile.iloc[index, 2]).lower())
        for index in range(len(self.captionsfile))
    ]
    self.caption_lengths = [len(tokens) for tokens in all_tokens]
def prepare_data(args):
    '''
    Do all the work of preparing the data.
    :param args:
    :return:
    '''
    trainset = REDataset(args.trainset_path, double_data=args.is_double_training_data)
    testset = REDataset(args.testset_path)

    # make vocab
    vocab = Vocabulary(word_num=args.vocab_word_num)
    corpus = []
    for example in trainset:
        corpus += example[0]
    if args.vocab_include_testset:
        for example in testset:
            corpus += example[0]
    vocab.add_from_corpus(corpus)

    # make label encoder
    all_labels = []
    for example in trainset:
        all_labels.append(example[1])
    label_encoder = LabelEncoder(all_labels)

    batch_maker = BatchMaker(vocab, label_encoder, max_length=args.max_length)
    traindata_loader = DataLoader(trainset,
                                  batch_size=args.batch_size,
                                  shuffle=args.shuffle,
                                  num_workers=args.num_workers,
                                  collate_fn=batch_maker.batch_packer)
    testdata_loader = DataLoader(testset,
                                 batch_size=args.test_batch_size,
                                 shuffle=args.test_shuffle,
                                 num_workers=args.num_workers,
                                 collate_fn=batch_maker.batch_packer)

    logger.info('trainset length: %d' % len(trainset))
    logger.info('testset length: %d' % len(testset))
    logger.info('vocabulary length: %d' % len(vocab))
    logger.info('labels num: %d' % len(label_encoder))

    return (traindata_loader, testdata_loader, trainset, testset, vocab,
            label_encoder, batch_maker)


# dataset = REDataset(TRAINSET_PATH)
# corpus = []
# for example in dataset:
#     corpus += example[0]
#
# vocab = Vocabulary(word_num=3000)
# vocab.add_from_corpus(corpus)
# sent = dataset[0][0]
# print(sent)
# print(vocab.encode(sent))
# print(vocab.decode(vocab.encode(sent)))
# print(dataset[0][-1])
def build_vocab(words):
    '''
    Build vocabulary and use it to format labels.
    '''
    vocab = Vocabulary(words)
    # Map each word to a one-hot vector over the vocabulary.
    output_vector = []
    for word in words:
        zeros = np.zeros(len(vocab), dtype=np.float32)
        zeros[vocab[word]] = 1.0
        output_vector.append(zeros)
    return vocab, output_vector
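# Hypothetical usage sketch for build_vocab() above (the word list is made up).
# Each returned vector is a one-hot row of length len(vocab), with the 1.0 at
# the word's index in the vocabulary.
words = ['red', 'green', 'blue']
vocab, one_hot = build_vocab(words)
assert one_hot[0][vocab['red']] == 1.0
assert one_hot[0].sum() == 1.0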
def __init__(self, filenames, char_vocab=None, feat_vocab=None, pos_vocab=None,
             pos_sp=True, train=True, covered=False):
    super().__init__()
    if isinstance(filenames, list):
        self.filenames = filenames
    elif isinstance(filenames, str):
        self.filenames = [filenames]
    else:
        raise ValueError
    self.train = train

    if char_vocab is None or feat_vocab is None or pos_vocab is None:
        # the vocabs should be None at the same time
        assert char_vocab is None and feat_vocab is None and pos_vocab is None
    if char_vocab is None:
        # if None, create new vocabs
        self.char_vocab = Vocabulary(unk=True, pad=True, bos=True, eos=True)
        self.feat_vocab = Vocabulary(unk=True)
        self.pos_vocab = Vocabulary(unk=True)
        self.m_char_vocab = self.char_vocab
    else:
        # else, load existing vocabs
        self.char_vocab = char_vocab
        self.feat_vocab = feat_vocab
        self.pos_vocab = pos_vocab
        self.m_char_vocab = Vocabulary.from_vocab(char_vocab)

    self.raw_data = []
    self.data = []
    self.organized_data = []
    self.data_sizes = []
    self.pos_sp = pos_sp
    self.covered = covered
    self.build_dataset()
def inference():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)
    vocab_path = train_config['vocab_path']
    model_save_path = train_config['model_save_path']

    epoch = None
    with open(os.path.join(model_save_path, 'checkpoint.txt')) as f:
        epoch = f.readlines()[0].split(':')[1]
    print(f'Weight is loaded from best checkpoint epoch {epoch}')

    vocab = Vocabulary(vocab_path)
    model = Spacing(vocab_len=len(vocab)).eval()
    trainer = Trainer(model=model, vocab=vocab, config=train_config)
    trainer.load(epoch)

    while True:
        text = input('Enter input text : ')
        words = text.split()
        data = []
        for word in words:
            chars = [char for char in word]
            data.append(chars)
        sorted_data = sorted(data, key=lambda e: len(e), reverse=True)
        idx = sorted(range(len(data)), key=lambda e: len(data[e]), reverse=True)

        batch_data, batch_label, lengths = trainer.make_input_tensor(sorted_data, None)
        outputs, _ = trainer.model.forward(batch_data, lengths)
        outputs = torch.round(outputs)

        results = []
        for output, word_chars in zip(outputs, sorted_data):
            result = ''
            for output_char, char in zip(output, word_chars):
                if output_char == 1:
                    result += (char + ' ')
                else:
                    result += char
            results.append(result)

        sorted_result = ''
        for i in range(len(idx)):
            sorted_result += results[idx.index(i)]
        print(sorted_result)
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize(args.imsize_pre),
        transforms.CenterCrop(args.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
        val_loader = DataLoader(
            val_dset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.n_cpu,
            collate_fn=collater,
        )

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    model = SPVSE(
        len(vocab),
        args.emb_size,
        args.out_size,
        args.max_len,
        args.cnn_type,
        args.rnn_type,
        pad_idx=vocab.padidx,
        bos_idx=vocab.bosidx,
    )

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    model = model.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    model.load_state_dict(ckpt["model_state"])

    _ = validate(1000, val_loader, model, vocab, args)
def build_vocab(datafile, threshold):
    counter = Counter()
    with open(datafile, 'r') as f:
        data = json.load(f)
    for caption in tqdm(list(map(lambda x: x['caption'], data))):
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

    tokens = [token for token, count in counter.items() if count >= threshold]

    vocab = Vocabulary()
    vocab.add_tokens(tokens)
    return vocab
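# Hypothetical usage sketch for build_vocab() above; the annotation path and
# the threshold value are assumptions. The JSON file is expected to hold a
# list of records with a 'caption' field, as the function itself assumes.
vocab = build_vocab('annotations/captions_train2017.json', threshold=5)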
def creatVocab(datalist, is_tags):
    vocab = Vocabulary() if is_tags else TokenVocabulary()
    word_counts = Counter(chain(*datalist))
    valid_words = [w for w, d in word_counts.items()]
    valid_words = sorted(valid_words, key=lambda x: word_counts[x], reverse=True)
    valid_words += ['<pad>']
    for token in valid_words:
        vocab.add_token(token)
    if not is_tags:
        unk_index = vocab.add_token('<unk>')
        vocab.set_unk_index(unk_index)
    return vocab
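# Hypothetical usage sketch for creatVocab() above; the token lists are made up.
# As coded, is_tags=False builds a TokenVocabulary with '<pad>' and '<unk>'
# entries, while is_tags=True builds a plain Vocabulary with '<pad>' only.
sentences = [['the', 'cat', 'sat'], ['a', 'dog', 'ran']]
tags = [['DET', 'NOUN', 'VERB'], ['DET', 'NOUN', 'VERB']]
word_vocab = creatVocab(sentences, is_tags=False)
tag_vocab = creatVocab(tags, is_tags=True)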
def __init__(self, data_path, train=False, longest_sequence_length=None):
    data0 = load_sent(data_path + '.0')
    data1 = load_sent(data_path + '.1')
    print('\n------------------------ Building a Dataset ------------------------')
    print(f'#sents of {data_path}.0 file 0: {len(data0)}')  # list of lists of tokenized words
    print(f'#sents of {data_path}.1 file 1: {len(data1)}')  # list of lists of tokenized words

    self.data_all = data0 + data1
    self.style_list = [0 for i in data0] + [1 for i in data1]  # data0 is all neg, data1 is all pos

    # sort all the data by sequence length in descending order
    zip_item = zip(self.data_all, self.style_list)
    sorted_item = sorted(zip_item, key=lambda p: len(p[0]), reverse=True)
    tuple_item = zip(*sorted_item)
    self.data_all, self.style_list = [list(t) for t in tuple_item]
    print(f'len(self.data_all) : {len(self.data_all)}')
    print(f'len(self.style_list): {len(self.style_list)}')

    if train:
        print('\ntrain: True')
        if not os.path.isfile(cfg.vocab):
            print(f'{cfg.vocab} does not exist')
            print('Building Vocab...')
            build_vocab(data0 + data1, cfg.vocab)
        else:
            print(f'{cfg.vocab} already exists')

    self.vocab = Vocabulary(cfg.vocab, cfg.embedding_file, cfg.embed_dim)
    print('\nvocabulary size:', self.vocab.size)
    print(f'vocabulary embedding matrix shape: {self.vocab.embedding.shape}')
    # print(type(self.vocab.embedding))  # np array

    self.longest_sequence_length = longest_sequence_length
    if longest_sequence_length is None:
        self.update_the_max_length()
    print(f'self.longest_sequence_length: {self.longest_sequence_length}')
    print('--------------------------------------------------------------------')
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
        val_loader = DataLoader(val_dset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=args.n_cpu,
                                collate_fn=collater_eval)

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size, args.rnn_type)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(args.checkpoint), flush=True)
    # map tensors to the active device so CPU-only hosts can load GPU-saved checkpoints
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {}".format(sec2str(time.time() - begin)), flush=True)

    retrieve_i2c(dset, val_dset, imenc, vocab, args)
    retrieve_c2i(dset, val_dset, capenc, vocab, args)
def transform_text(text):
    tf.compat.v1.disable_eager_execution()
    args = load_arguments()
    ah = vars(args)
    ah['vocab'] = '../model/yelp.vocab'
    ah['model'] = '../model/model'
    ah['load_model'] = True
    ah['beam'] = 8
    ah['batch_size'] = 1
    inp = [text]

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.compat.v1.Session(config=config) as sess:
        model = create_model(sess, args, vocab)
        decoder = beam_search.Decoder(sess, args, vocab, model)

        '''test_losses = transfer(model, decoder, sess, args, vocab,
                                  test0, test1, args.output)'''
        batches, order0, order1 = get_batches(inp, inp, vocab.word2id, args.batch_size)

        data0_tsf, data1_tsf = [], []
        losses = Accumulator(len(batches), ['loss', 'rec', 'adv', 'd0', 'd1'])

        # rec, tsf = decoder.rewrite(inp)
        # print(rec)
        # print(tsf)

        for batch in batches:
            rec, tsf = decoder.rewrite(batch)
            half = batch['size'] // 2
            print("rec:")
            print(rec)
            print("tsf:")
            print(tsf)
            data0_tsf += tsf[:half]
            data1_tsf += tsf[half:]

        n0, n1 = len(inp), len(inp)
        data0_tsf = reorder(order0, data0_tsf)[:n0]
        data1_tsf = reorder(order1, data1_tsf)[:n1]
        print(data0_tsf)
        print(data1_tsf)
def run_evaluation(corpus_dir, save_dir, datafile, config_file):
    config = Config.from_json_file(config_file)
    vocab = Vocabulary("words")

    # checkpoint to load from
    load_filename = os.path.join(
        save_dir, config.model_name, config.corpus_name,
        '{}-{}_{}'.format(config.encoder_n_layers, config.decoder_n_layers,
                          config.hidden_size),
        'last_checkpoint.tar')

    # load on the same machine the model was trained on
    checkpoint = torch.load(load_filename)
    # to load a model trained on gpu onto cpu, use instead:
    # checkpoint = torch.load(load_filename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint["en"]
    decoder_sd = checkpoint["de"]
    encoder_optimizer_sd = checkpoint["en_opt"]
    decoder_optimizer_sd = checkpoint["de_opt"]
    embedding_sd = checkpoint["embedding"]
    vocab.__dict__ = checkpoint["voc_dict"]

    print("Building encoder and decoder ...")
    # initialize word embeddings
    embedding = nn.Embedding(vocab.num_words, config.hidden_size)
    embedding.load_state_dict(embedding_sd)
    # initialize encoder and decoder models
    encoder = EncoderRNN(config.hidden_size, embedding, config.encoder_n_layers,
                         config.dropout)
    decoder = LuongAttnDecoderRNN(config.attn_model, embedding, config.hidden_size,
                                  vocab.num_words, config.decoder_n_layers,
                                  config.dropout)
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
    # set dropout layers to eval mode
    encoder.eval()
    decoder.eval()

    # initialize search module
    searcher = GreedySearchDecoder(encoder, decoder)

    # begin chatting
    evaluate_input(encoder, decoder, searcher, vocab)
def create_vocab(qas, threshold=4):
    counter = Counter()
    for qa in qas:
        question = qa['question']
        answer = qa['answer']
        qtokens = nltk.tokenize.word_tokenize(question.lower())
        atokens = nltk.tokenize.word_tokenize(answer.lower())
        counter.update(qtokens)
        counter.update(atokens)

    # If a word frequency is less than 'threshold', the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Add the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)
    return vocab
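# Hypothetical usage sketch for create_vocab() above; the qa records are made
# up, and the threshold is lowered so the toy example keeps every word.
qas = [{'question': 'What color is the sky?', 'answer': 'blue'},
       {'question': 'What color is grass?', 'answer': 'green'}]
vocab = create_vocab(qas, threshold=1)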
def vectorize(question1, question2, is_duplicate):
    from vocab import Vocabulary
    v = Vocabulary()

    # Vectorize the data.
    input_texts = []
    target_texts = []
    input_characters = set()
    target_characters = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    for line in lines[: min(num_samples, len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        # We use "tab" as the "start sequence" character
        # for the targets, and "\n" as "end sequence" character.
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)

    input_characters = sorted(list(input_characters))
    target_characters = sorted(list(target_characters))
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    print('Number of samples:', len(input_texts))
    print('Number of unique input tokens:', num_encoder_tokens)
    print('Number of unique output tokens:', num_decoder_tokens)
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)

    input_token_index = dict(
        [(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict(
        [(char, i) for i, char in enumerate(target_characters)])
def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False, voc=None):
    if voc is None:
        voc = Vocabulary('voc')
        file, file_len = read_file('tiny.txt')  # reads file as one giant string
        for w in file:
            voc.add_word(w)

    hidden = decoder.init_hidden(1)
    prime_input = Variable(voc.word_tensor(prime_str).unsqueeze(0))

    if cuda:
        hidden = hidden.cuda()
        prime_input = prime_input.cuda()
    predicted = prime_str

    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[:, p], hidden)
    inp = prime_input[:, -1]

    for p in range(predict_len):
        output, hidden = decoder(inp, hidden)

        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = torch.multinomial(output_dist, 1)[0]
        top_index = top_i.item()

        # Add predicted character to string and use as next input
        predicted_word = voc.to_word(top_index)
        predicted += predicted_word
        inp = torch.tensor([top_index])
        if cuda:
            inp = inp.cuda()

    return predicted
if args.train or args.latent_train:
    chosen = args.train if len(args.train) > len(args.latent_train) else \
        args.latent_train
    # train0 = load_sent(chosen + '.0', args.max_train_size)
    # train1 = load_sent(chosen + '.1', args.max_train_size)
    train0 = load_sent(chosen + 'formal', args.max_train_size)
    train1 = load_sent(chosen + 'informal', args.max_train_size)
    print('#sents of training file 0:', len(train0))
    print('#sents of training file 1:', len(train1))

    if not os.path.isfile(args.vocab):
        build_vocab(train0 + train1, args.vocab)

vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
print('vocabulary size:', vocab.size)

if args.dev or args.latent_dev:
    chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
        args.latent_dev
    dev0 = load_sent(chosen + 'formal')
    dev1 = load_sent(chosen + 'informal')

if args.test or args.latent_test:
    chosen = args.test if len(args.test) > len(args.latent_test) else \
        args.latent_test
    test0 = load_sent(chosen + 'formal')
    test1 = load_sent(chosen + 'informal')

# get config object and set dynamic memory allocation