def test_model(loader, model_path):
    word_set, word2idx, vocab_size = load_vocab(path.vocab_path)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    model = torch.load(model_path)
    # test_by_evidences(model, loader, idx2word)  # 0.58 0.62
    test_by_questions(model, loader, idx2word)  # 0.61 0.66
def main():
    train = load_data('data/train.jsonl')
    token2id = load_vocab('data/vocab.json')
    vocab_size = get_vocab_size(token2id)
    embed_size = 20
    hidden_size = 40
    num_epochs = 15
    model = PyTorchModel(vocab_size, embed_size, hidden_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    for _ in range(num_epochs):
        total_loss = 0
        with tqdm(train) as pbar:
            for i, instance in enumerate(pbar):
                source = torch.LongTensor(instance['source']).unsqueeze(0)
                target = torch.LongTensor(instance['target']).unsqueeze(0)
                loss = model(source, target)
                total_loss += loss.item()
                average_loss = total_loss / (i + 1)
                loss_str = f'{average_loss:.4f}'
                pbar.set_description(loss_str)
                tqdm.write(str(loss.item()))
                if torch.isnan(loss):
                    pbar.close()
                    exit()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
def ptb_raw_data(data_path=None, vocab_path=None):
    """Load PTB raw data from data directory "data_path".

    Reads PTB text files, converts strings to integer ids,
    and performs mini-batching of the inputs.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
        data_path: string path to the directory where simple-examples.tgz
            has been extracted.

    Returns:
        tuple (train_data, valid_data, test_data, vocabulary)
        where each of the data objects can be passed to PTBIterator.
    """
    train_path = os.path.join(data_path, "mrc.train.txt")
    valid_path = os.path.join(data_path, "mrc.valid.txt")
    test_path = os.path.join(data_path, "mrc.test.txt")

    if vocab_path is None:
        word_to_id = _build_vocab(train_path)
    else:
        word_to_id = load_vocab(vocab_path)
        print('load {} words'.format(len(word_to_id)))

    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary
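# Hedged sketch (assumption): `_build_vocab` and `_file_to_word_ids` are not shown
# above; minimal versions following the standard TensorFlow PTB reader pattern
# could look like the following. The real helpers in this codebase may differ
# (e.g. in how <eos> and unknown tokens are handled).
import collections


def _read_words(filename):
    # Split on whitespace, turning line breaks into an explicit <eos> token.
    with open(filename, "r") as f:
        return f.read().replace("\n", "<eos>").split()


def _build_vocab(filename):
    # Assign integer ids by descending frequency (ties broken alphabetically).
    counter = collections.Counter(_read_words(filename))
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    return dict(zip(words, range(len(words))))


def _file_to_word_ids(filename, word_to_id):
    # Map a token file to a flat list of ids, skipping out-of-vocabulary tokens.
    return [word_to_id[word] for word in _read_words(filename) if word in word_to_id]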
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input .npy training data", required=True)
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    args = parser.parse_args()

    if not args.seqlen:
        sequence_length = DEFAULT_SEQ_LEN
    else:
        sequence_length = int(args.seqlen)

    df = pd.read_csv("data/songdata.zip")
    path = Path("chars.pkl")
    chars = list()
    if path.is_file():
        chars = util.load_vocab(path)
        print("Loaded from file")
    else:
        vocab = set()
        for song in df["text"]:
            chars = set(song)
            vocab = vocab.union(chars)
        chars = list(vocab)
        util.write_vocab(path, chars)
        print("Generated from source")

    vocab_size = len(chars)
    print("Vocab size:", vocab_size)

    data = np.load(args.input)
    X = data[:, :-1]
    Y = data[:, -1]

    kfold = KFold(n_splits=4)
    scores = np.zeros((4,))
    for (i, (train_index, test_index)) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        checkpoint = ModelCheckpoint("weights/weights_char_k{}_{}.h5".format(i, "{epoch:01d}"),
                                     monitor='loss', verbose=1, mode='auto',
                                     period=1, save_weights_only=True)
        model = build_model(sequence_length, vocab_size)
        model.fit_generator(generate_batches(X_train, Y_train, BATCH_SIZE, vocab_size),
                            samples_per_epoch=300, epochs=10, callbacks=[checkpoint])
        perp = perplexity_score(model, X_test, Y_test, vocab_size)
        print("Local perplexity:", perp)
        scores[i] = perp
        del X_train, X_test, Y_train, Y_test
        del model
        gc.collect()

    print("Perplexity: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input .npy training data", required=True)
    parser.add_argument("-v", "--vocab", help="Training vocab", required=True)
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    args = parser.parse_args()

    if not args.seqlen:
        sequence_length = DEFAULT_SEQ_LEN
    else:
        sequence_length = int(args.seqlen)

    w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec', binary=True)

    words = util.load_vocab(args.vocab)
    vocab_size = len(words)
    print("Vocab size:", vocab_size)
    print("W2V vocab size:", len(w2v.vocab))
    idx2word = {i: word for i, word in enumerate(words)}

    embedding_size = w2v.vector_size + util.EMBEDDING_EXT
    print("Embedding size:", embedding_size)

    data = np.load(args.input)
    X = data[:, :-1]
    Y = data[:, -1]

    kfold = KFold(n_splits=4)
    scores = np.zeros((4, ))
    for (i, (train_index, test_index)) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        checkpoint = ModelCheckpoint("weights/weights_word_k{}_{}.h5".format(i, "{epoch:01d}"),
                                     monitor='loss', verbose=1, mode='auto',
                                     period=1, save_weights_only=True)
        model = build_model(vocab_size, sequence_length, embedding_size)
        model.fit_generator(generate_batches(X_train, Y_train, BATCH_SIZE, embedding_size, idx2word, w2v),
                            samples_per_epoch=300, epochs=4, callbacks=[checkpoint])
        perp = perplexity_score(model, X_test, Y_test, idx2word, w2v)
        print("Local perplexity:", perp)
        scores[i] = perp

    print("Perplexity: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def train_featQA(train_loader, val_loader, param):
    # 2 is not weight
    param.model_dir = '../model/baselineQA_' + str(datetime.now()).split('.')[0].split()[0] + '/'
    if not os.path.exists(param.model_dir):
        os.mkdir(param.model_dir)
    word_set, word2idx, vocab_size = load_vocab(path.vocab_path)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    model = baselineQA(vocab_size, param, 0).cuda()
    train(model, train_loader, val_loader, param, idx2word)
def init_embedding(self, embedding_dim):
    # use a pre-trained embedding layer; this trick is from machinelearningmastery.com
    vocab = load_vocab()
    word2idx = {w: i for i, w in enumerate(vocab)}
    raw_embedding = load_embedding(config.path_embedding)
    embedding_weight = get_weight_matrix(raw_embedding, word2idx)
    embedding = nn.Embedding(len(vocab), embedding_dim=embedding_dim)
    embedding.weight = nn.Parameter(torch.from_numpy(embedding_weight).float())
    return embedding
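# Hedged sketch (assumption): `load_embedding` and `get_weight_matrix` are not
# shown in this snippet; the usual pattern behind this trick is to parse a
# GloVe/word2vec-style text file and then build one weight-matrix row per
# vocabulary word. The 100-dimension default below is illustrative only.
import numpy as np


def load_embedding(path):
    # Each line: "word v1 v2 ... vN".
    embedding = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            embedding[parts[0]] = np.asarray(parts[1:], dtype="float32")
    return embedding


def get_weight_matrix(raw_embedding, word2idx, embedding_dim=100):
    # Words without a pre-trained vector keep an all-zero row.
    weights = np.zeros((len(word2idx), embedding_dim), dtype="float32")
    for word, idx in word2idx.items():
        vector = raw_embedding.get(word)
        if vector is not None:
            weights[idx] = vector
    return weights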
def main():
    # load vocab fit to http://www.nature.com/articles/srep00196
    vocab = util.load_vocab('../fit/nature_and_kaggle_vocab.txt')
    foods = [tup[0] for tup in vocab]
    recipes = util.load_recipes('../dat/train.json')
    recipes = [recipe for recipe in recipes if len(recipe[1]) < 30]
    # recipes = recipes[0:1000]  # debug on smaller dataset
    country2region = get_country2region()
    # match the kaggle data to the nature data vocabulary
    # parsed_recipes = util.parse_recipes(foods, country2region, recipes)
    parsed_recipes = util.parse_recipes_parallel(foods, country2region, recipes)
    util.write_recipes(parsed_recipes, 'kaggle_recipes.csv')
def main():
    # load vocab fit to http://www.nature.com/articles/srep00196
    vocab = util.load_vocab('../fit/nature_and_kaggle_vocab.txt')
    foods = [tup[0] for tup in vocab]
    recipes = util.load_recipes('./train.json')
    recipes = [recipe for recipe in recipes if len(recipe[1]) < 30]
    # recipes = recipes[0:1000]  # debug on smaller dataset
    country2region = get_country2region()
    # match the kaggle data to the nature data vocabulary
    # parsed_recipes = util.parse_recipes(foods, country2region, recipes)
    parsed_recipes = util.parse_recipes_parallel(foods, country2region, recipes)
    util.write_recipes(parsed_recipes, 'kaggle_recipes.csv')
def test(args):
    """Method that performs prediction."""
    batchsize = args.batchsize  # batch size

    # load the vocabulary dictionary
    src_vocab2id, src_id2vocab, vocab_size = util.load_vocab(args.model + ".srcvocab")

    # load the model
    model = NLM.load_spec(args.model + ".spec")

    # whether to use the GPU
    if args.use_gpu:
        cuda.check_cuda_available()
        cuda.get_device(1).use()
        model.to_gpu()
    xp = cuda.cupy if args.use_gpu else np  # args.gpu <= 0: use cpu, otherwise: use gpu

    serializers.load_hdf5(args.model + ".weights", model)

    # Source sequence for test
    print 'loading source data for test...'
    # load the dataset
    test_src_dataset = util.load_test_src_data(args.src, src_vocab2id)

    generated = 0
    N = len(test_src_dataset)  # number of test examples

    word_list = src_vocab2id.keys()
    # get the embedding of each word
    word_id_list = Variable(xp.asarray([src_vocab2id[word] for word in word_list], dtype=xp.int32))
    embedding_list = model.get_embedding(word_id_list)
    src_embed = embedding_list.data[word_list.index(args.src_word)]
    # print model.embed.W.data.shape
    print "src word:", args.src_word
    print src_embed
    # src_embed = model.embed.W.data[src_vocab2id[args.src_word]]

    trg_embed_list = {}
    for _word, _id in src_vocab2id.items():
        trg_embed = embedding_list.data[word_list.index(_word)]
        # trg_embed = model.embed.W.data[src_vocab2id[_word]]
        trg_embed_list[_word] = 1 - scipy.spatial.distance.cosine(src_embed, trg_embed)

    # show the top 10
    for i, (word, sim) in enumerate(sorted(trg_embed_list.items(), key=lambda x: x[1], reverse=True)):
        print word, sim
        if i == 10:
            break
def from_paths(cls, weights_path, vocab_path, sequence_length):
    """
    Loads a character sampler from the specified paths.

    Args:
        weights_path: Path to the weights of the character-level language model
        vocab_path: Pickled character vocabulary file path
        sequence_length: Sequence length of the used model
    """
    chars = util.load_vocab(vocab_path)
    model = build_character_level_model(len(chars), sequence_length)
    model.load_weights(weights_path)
    return cls(model, chars, sequence_length)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    parser.add_argument("-o", "--output", help="Output .npy path", required=True)
    args = parser.parse_args()

    if not args.seqlen:
        seq_len = DEFAULT_SEQ_LEN
    else:
        seq_len = int(args.seqlen)

    df = pd.read_csv(SRC_PATH)
    path = Path("chars.pkl")
    chars = list()
    if path.is_file():
        chars = util.load_vocab(path)
        print("Loaded vocabulary from file")
    else:
        vocab = set()
        for song in df["text"]:
            chars = set(song)
            vocab = vocab.union(chars)
        chars = list(vocab)
        util.write_vocab(path, chars)
        print("Generated character vocabulary as chars.pkl")

    vocab_size = len(chars)
    print("Vocab size:", vocab_size)
    char2idx = {char: i for i, char in enumerate(chars)}

    print("Generating training samples...")
    buffer_size = BUFFER_INC
    buffer = np.zeros((buffer_size, seq_len + 1), dtype=np.int64)
    i = 0
    for song in tqdm(df['text']):
        for xs in build_samples(song, seq_len):
            buffer[i] = [char2idx[x] for x in xs]
            i += 1
            if i >= buffer_size:
                buffer_size += BUFFER_INC
                buffer.resize((buffer_size, seq_len + 1))
    buffer.resize(i, seq_len + 1)

    print("Saving to {}...".format(args.output))
    np.save(args.output, buffer)
def from_paths(cls, weights_path, vocab_path, sequence_length):
    """
    Loads a word sampler from the specified paths.

    Args:
        weights_path: Path to the weights of the word-level language model
        vocab_path: Pickled word vocabulary file path
        sequence_length: Sequence length of the used model
    """
    w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec', binary=True)
    words = util.load_vocab(vocab_path)
    embedding_size = w2v.vector_size + util.EMBEDDING_EXT
    model = build_word_level_model(len(words), sequence_length, embedding_size)
    model.load_weights(weights_path)
    return cls(model, w2v, words, sequence_length, embedding_size)
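# Hedged sketch (assumption): the 'glove.6B.100d.bin.word2vec' file loaded above
# is presumably a GloVe text file converted once into binary word2vec format.
# With gensim 3.x that one-off conversion could look like this; the script
# itself is not part of the snippets shown, and the text-file name is assumed.
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Prepend the "<vocab_size> <dim>" header that the word2vec text format expects.
glove2word2vec("glove.6B.100d.txt", "glove.6B.100d.txt.word2vec")
# Re-save in binary form so later loads with binary=True are fast.
kv = KeyedVectors.load_word2vec_format("glove.6B.100d.txt.word2vec", binary=False)
kv.save_word2vec_format("glove.6B.100d.bin.word2vec", binary=True)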
def parse_scraped_site(in_path, out_path):
    """Load vocab fit to nature and kaggle data from
    http://www.nature.com/articles/srep00196
    wget https://www.kaggle.com/c/whats-cooking/download/train.json.zip
    and match it to scraped allrecipes data
    """
    vocab = util.load_vocab('../fit/nature_and_kaggle_vocab.txt')
    foods = [tup[0] for tup in vocab]
    ingredients_lists = load_ingredients_lists(in_path)
    # ingredients_lists = ingredients_lists[0:1000]  # for debugging
    ingredients_lists = [util.filter_stopwords(l) for l in ingredients_lists]
    parsed_ingredients = util.parse_ingredients_parallel(foods, ingredients_lists)
    parsed_recipes = [("Unknown", ingredients) for ingredients in parsed_ingredients]
    util.write_recipes(parsed_recipes, out_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input vocabulary", required=True)
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    parser.add_argument("-o", "--output", help="Output .npy path", required=True)
    args = parser.parse_args()

    if not args.seqlen:
        seq_len = DEFAULT_SEQ_LEN
    else:
        seq_len = int(args.seqlen)

    print("Reading songs...")
    songs = None
    with open(SRC_PATH, "r") as f:
        songs = [line.rstrip().split(" ") for line in f]

    print("Loading vocab...")
    words = util.load_vocab(args.input)
    word2idx = {word: i for i, word in enumerate(words)}

    buffer_size = BUFFER_INC
    print("Generating ngrams...")
    buffer = np.zeros((buffer_size, seq_len + 1), dtype=np.int64)
    i = 0
    for song in tqdm(songs):
        for xs, y in ngramify(song, seq_len, word2idx):
            xs = [word2idx[x] for x in xs]
            y = word2idx[y]
            xs.append(y)
            buffer[i] = xs
            i += 1
            if i >= buffer_size:
                buffer_size += BUFFER_INC
                buffer.resize((buffer_size, seq_len + 1))
    buffer.resize(i, seq_len + 1)

    print("Saving to {}...".format(args.output))
    np.save(args.output, buffer)
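# Hedged sketch (assumption): `ngramify` is not shown above; given that the
# caller re-encodes its outputs with word2idx, it presumably yields
# (seq_len-word context, next word) string pairs, something like:
def ngramify(song, seq_len, word2idx):
    # Slide a fixed-length window over the token list and yield
    # (context, next-word) pairs, skipping samples with out-of-vocab tokens.
    for start in range(len(song) - seq_len):
        xs = song[start:start + seq_len]
        y = song[start + seq_len]
        if y in word2idx and all(x in word2idx for x in xs):
            yield xs, y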
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threshold", help="Threshold to drop words")
    parser.add_argument("-o", "--output", help="Output .pkl path", required=True)
    args = parser.parse_args()

    if not args.threshold:
        threshold = DEFAULT_THRESH
    else:
        threshold = float(args.threshold)
    print("Threshold:", threshold)

    print("Calculating word frequencies...")
    freqs = dict()
    with open(SRC_PATH, "r") as f:
        for line in f:
            for token in line.rstrip().split(" "):
                if token not in freqs:
                    freqs[token] = 1
                else:
                    freqs[token] += 1

    total_words = len(freqs.keys())
    discard = set()
    for word in freqs.keys():
        z = freqs[word] / total_words
        p = (math.sqrt(z / threshold) + 1) * (threshold / z)
        if random() <= p:
            discard.add(word)

    print("Total words:", total_words)
    print("Discarded words:", len(discard))
    print("Target vocab size:", total_words - len(discard))

    words = util.load_vocab(SRC_VOCAB_PATH)
    new_words = list(set(words).difference(discard))
    util.write_vocab(args.output, new_words)
    print("Vocab written to:", args.output)
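# Worked example of the drop probability computed above: with a threshold of
# t = 1e-3 (the real default, DEFAULT_THRESH, is defined elsewhere) and a word
# whose relative frequency is z = 0.01,
#     p = (sqrt(z / t) + 1) * (t / z) = (sqrt(10) + 1) * 0.1 ≈ 0.416,
# so `random() <= p` puts that word in the discard set roughly 42% of the time.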
def run(data_file, is_train=False, **args): is_test = not is_train batchsize = args['batchsize'] model_name = args['model_name'] optimizer_name = args['optimizer'] save_dir = args['save_dir'] print args if save_dir[-1] != '/': save_dir = save_dir + '/' # TODO: check save_dir exist if not os.path.isdir(save_dir): err_msg = 'There is no dir : {}\n'.format(save_dir) err_msg += '##############################\n' err_msg += '## Please followiing: \n' err_msg += '## $ mkdir {}\n'.format(save_dir) err_msg += '##############################\n' raise ValueError(err_msg) save_name = args['save_name'] if save_name == '': save_name = '_'.join([model_name, optimizer_name]) save_name = save_dir + save_name xp = cuda.cupy if args['gpu'] >= 0 else np efficient_gpu = False if args['gpu'] >= 0: cuda.get_device(args['gpu']).use() xp.random.seed(1234) efficient_gpu = args.get('efficient_gpu', False) def to_gpu(x): if args['gpu'] >= 0: return chainer.cuda.to_gpu(x) return x # load files dev_file = args['dev_file'] test_file = args['test_file'] delimiter = args['delimiter'] input_idx = map(int, args['input_idx'].split(',')) output_idx = map(int, args['output_idx'].split(',')) word_input_idx = input_idx[0] # NOTE: word_idx is first column! additional_input_idx = input_idx[1:] sentences_train = [] if is_train: sentences_train = util.read_conll_file(filename=data_file, delimiter=delimiter) if len(sentences_train) == 0: s = str(len(sentences_train)) err_msg = 'Invalid training sizes: {} sentences. '.format(s) raise ValueError(err_msg) else: # Predict sentences_train = util.read_raw_file(filename=data_file, delimiter=u' ') # sentences_train = sentences_train[:100] sentences_dev = [] sentences_test = [] if dev_file: sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter) if test_file: sentences_test = util.read_conll_file(test_file, delimiter=delimiter) save_vocab = save_name + '.vocab' save_vocab_char = save_name + '.vocab_char' save_tags_vocab = save_name + '.vocab_tag' save_train_config = save_name + '.train_config' # TODO: check unkown pos tags # TODO: compute unk words vocab_adds = [] if is_train: sentences_words_train = [[w_obj[word_input_idx] for w_obj in sentence] for sentence in sentences_train] vocab = util.build_vocab(sentences_words_train) vocab_char = util.build_vocab(util.flatten(sentences_words_train)) vocab_tags = util.build_tag_vocab(sentences_train) # Additional setup for ad_feat_id in additional_input_idx: sentences_additional_train = [[ feat_obj[ad_feat_id] for feat_obj in sentence ] for sentence in sentences_train] vocab_add = util.build_vocab(sentences_additional_train) vocab_adds.append(vocab_add) elif is_test: vocab = util.load_vocab(save_vocab) vocab_char = util.load_vocab(save_vocab_char) vocab_tags = util.load_vocab(save_tags_vocab) if args.get('word_emb_file', False): # set Pre-trained embeddings # emb_file = './emb/glove.6B.100d.txt' emb_file = args['word_emb_file'] word_emb_vocab_type = args.get('word_emb_vocab_type') def assert_word_emb_shape(shape1, shape2): err_msg = '''Pre-trained embedding size is not equal to `--n_word_emb` ({} != {})''' if shape1 != shape2: err_msg = err_msg.format(str(shape1), str(shape2)) raise ValueError(err_msg) def assert_no_emb(word_vecs): err_msg = '''There is no-embeddings! 
Please check your file `--word_emb_file`''' if word_vecs.shape[0] == 0: raise ValueError(err_msg) if word_emb_vocab_type == 'replace_all': # replace all vocab by Pre-trained embeddings word_vecs, vocab_glove = util.load_glove_embedding_include_vocab( emb_file) vocab = vocab_glove elif word_emb_vocab_type == 'replace_only': word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab) assert_no_emb(word_vecs) elif word_emb_vocab_type == 'additional': word_vecs, vocab_glove = util.load_glove_embedding_include_vocab( emb_file) additional_vecs = [] for word, word_idx in sorted(vocab_glove.items(), key=lambda x: x[1]): if word not in vocab: vocab[word] = len(vocab) additional_vecs.append(word_vecs[word_idx]) additional_vecs = np.array(additional_vecs, dtype=np.float32) if args.get('vocab_file', False): vocab_file = args['vocab_file'] vocab = util.load_vocab(vocab_file) if args.get('vocab_char_file', False): vocab_char_file = args['vocab_char_file'] vocab_char = util.load_vocab(vocab_char_file) vocab_tags_inv = dict((v, k) for k, v in vocab_tags.items()) PAD_IDX = vocab[PADDING] UNK_IDX = vocab[UNKWORD] CHAR_PAD_IDX = vocab_char[PADDING] CHAR_UNK_IDX = vocab_char[UNKWORD] tmp_xp = xp if efficient_gpu: tmp_xp = np # use CPU (numpy) def parse_to_word_ids(sentences, word_input_idx, vocab): return util.parse_to_word_ids(sentences, xp=tmp_xp, vocab=vocab, UNK_IDX=UNK_IDX, idx=word_input_idx) def parse_to_char_ids(sentences): return util.parse_to_char_ids(sentences, xp=tmp_xp, vocab_char=vocab_char, UNK_IDX=CHAR_UNK_IDX, idx=word_input_idx) def parse_to_tag_ids(sentences): return util.parse_to_tag_ids(sentences, xp=tmp_xp, vocab=vocab_tags, UNK_IDX=-1, idx=-1) x_train = parse_to_word_ids(sentences_train, word_input_idx, vocab) x_char_train = parse_to_char_ids(sentences_train) y_train = parse_to_tag_ids(sentences_train) x_train_additionals = [ parse_to_word_ids(sentences_train, ad_feat_id, vocab_adds[i]) for i, ad_feat_id in enumerate(additional_input_idx) ] x_dev = parse_to_word_ids(sentences_dev, word_input_idx, vocab) x_char_dev = parse_to_char_ids(sentences_dev) y_dev = parse_to_tag_ids(sentences_dev) x_dev_additionals = [ parse_to_word_ids(sentences_dev, ad_feat_id, vocab_adds[i]) for i, ad_feat_id in enumerate(additional_input_idx) ] y_dev_cpu = [[w[-1] for w in sentence] for sentence in sentences_dev] # tag_names = [] tag_names = list( set([ tag[2:] if len(tag) >= 2 else tag[0] for tag in vocab_tags.keys() ])) x_test = parse_to_word_ids(sentences_test, word_input_idx, vocab) x_char_test = parse_to_char_ids(sentences_test) y_test = parse_to_tag_ids(sentences_test) x_test_additionals = [ parse_to_word_ids(sentences_test, ad_feat_id, vocab_adds[i]) for i, ad_feat_id in enumerate(additional_input_idx) ] cnt_train_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_train]) cnt_train_word = sum([d.size for d in x_train]) unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word cnt_dev_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_dev]) cnt_dev_word = sum([d.size for d in x_dev]) unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1) logging.info('train:' + str(len(x_train))) logging.info('dev :' + str(len(x_dev))) logging.info('test :' + str(len(x_test))) logging.info('vocab :' + str(len(vocab))) logging.info('vocab_tags:' + str(len(vocab_tags))) logging.info('unk count (train):' + str(cnt_train_unk)) logging.info('unk rate (train):' + str(unk_train_unk_rate)) logging.info('cnt all words (train):' + str(cnt_train_word)) logging.info('unk count (dev):' + str(cnt_dev_unk)) logging.info('unk rate 
(dev):' + str(unk_dev_unk_rate)) logging.info('cnt all words (dev):' + str(cnt_dev_word)) # show model config logging.info('######################') logging.info('## Model Config') logging.info('model_name:' + str(model_name)) logging.info('batchsize:' + str(batchsize)) logging.info('optimizer:' + str(optimizer_name)) # Save model config logging.info('######################') logging.info('## Model Save Config') logging.info('save_dir :' + str(save_dir)) # save vocab logging.info('save_vocab :' + save_vocab) logging.info('save_vocab_char :' + save_vocab_char) logging.info('save_tags_vocab :' + save_tags_vocab) logging.info('save_train_config :' + save_train_config) init_emb = None if is_train: util.write_vocab(save_vocab, vocab) util.write_vocab(save_vocab_char, vocab_char) util.write_vocab(save_tags_vocab, vocab_tags) util.write_vocab(save_train_config, args) n_vocab_add = [len(_vadd) for _vadd in vocab_adds] net = BiLSTM_CNN_CRF(n_vocab=len(vocab), n_char_vocab=len(vocab_char), emb_dim=args['n_word_emb'], hidden_dim=args['n_hidden'], n_layers=args['n_layer'], init_emb=init_emb, char_input_dim=args['n_char_emb'], char_hidden_dim=args['n_char_hidden'], n_label=len(vocab_tags), n_add_feature_dim=args['n_add_feature_emb'], n_add_feature=len(n_vocab_add), n_vocab_add=n_vocab_add, use_cudnn=args['use_cudnn']) my_cudnn(args['use_cudnn']) if args.get('word_emb_file', False): if word_emb_vocab_type == 'replace_all': # replace all vocab by Pre-trained embeddings assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1]) net.word_embed.W.data = word_vecs[:] elif word_emb_vocab_type == 'replace_only': assert_no_emb(word_vecs) assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1]) net.word_embed.W.data[word_ids] = word_vecs[:] elif word_emb_vocab_type == 'additional': assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1]) v_size = additional_vecs.shape[0] net.word_embed.W.data[-v_size:] = additional_vecs[:] if args.get('return_model', False): return net if args['gpu'] >= 0: net.to_gpu() init_alpha = args['init_lr'] if optimizer_name == 'adam': opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9) elif optimizer_name == 'adadelta': opt = optimizers.AdaDelta() if optimizer_name == 'sgd_mom': opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9) if optimizer_name == 'sgd': opt = optimizers.SGD(lr=init_alpha) opt.setup(net) opt.add_hook(chainer.optimizer.GradientClipping(5.0)) def eval_loop(x_data, x_char_data, y_data, x_train_additionals=[]): # dev or test net.set_train(train=False) iteration_list = range(0, len(x_data), batchsize) # perm = np.random.permutation(len(x_data)) sum_loss = 0.0 predict_lists = [] for i_index, index in enumerate(iteration_list): x = x_data[index:index + batchsize] x_char = x_char_data[index:index + batchsize] target_y = y_data[index:index + batchsize] if efficient_gpu: x = [to_gpu(_) for _ in x] x_char = [[to_gpu(_) for _ in words] for words in x_char] target_y = [to_gpu(_) for _ in target_y] x_additional = [] if len(x_train_additionals): x_additional = [[ to_gpu(_) for _ in x_ad[index:index + batchsize] ] for x_ad in x_train_additionals] output = net(x_data=x, x_char_data=x_char, x_additional=x_additional) predict, loss = net.predict(output, target_y) sum_loss += loss.data predict_lists.extend(predict) _, predict_tags = zip(*predict_lists) predicted_results = [] for predict in predict_tags: predicted = [ vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict) ] predicted_results.append(predicted) return predict_lists, 
sum_loss, predicted_results if args['model_filename']: model_filename = args['model_filename'] serializers.load_hdf5(model_filename, net) if is_test: # predict # model_filename = args['model_filename'] # model_filename = save_dir + model_filename # serializers.load_hdf5(model_filename, net) vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()]) x_predict = x_train x_char_predict = x_char_train y_predict = y_train if dev_file: predict_dev, loss_dev, predict_dev_tags = eval_loop( x_dev, x_char_dev, y_dev) gold_predict_pairs = [y_dev_cpu, predict_dev_tags] result, phrase_info = util.conll_eval(gold_predict_pairs, flag=False, tag_class=tag_names) all_result = result['All_Result'] print 'all_result:', all_result predict_pairs, _, _tmp = eval_loop(x_predict, x_char_predict, y_predict) _, predict_tags = zip(*predict_pairs) predicted_output = args['predicted_output'] predicted_results = [] for predict in predict_tags: predicted = [ vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict) ] predicted_results.append(predicted) f = open(predicted_output, 'w') for predicted in predicted_results: for tag in predicted: f.write(tag + '\n') f.write('\n') f.close() return False tmax = args['max_iter'] t = 0.0 prev_dev_accuracy = 0.0 prev_dev_f = 0.0 for epoch in xrange(args['max_iter']): # train net.set_train(train=True) iteration_list = range(0, len(x_train), batchsize) perm = np.random.permutation(len(x_train)) sum_loss = 0.0 predict_train = [] for i_index, index in enumerate(iteration_list): data = [(x_train[i], x_char_train[i], y_train[i]) for i in perm[index:index + batchsize]] x, x_char, target_y = zip(*data) x_additional = [] if len(x_train_additionals): x_additional = [[ to_gpu(x_ad[add_i]) for add_i in perm[index:index + batchsize] ] for x_ad in x_train_additionals] if efficient_gpu: x = [to_gpu(_) for _ in x] x_char = [[to_gpu(_) for _ in words] for words in x_char] target_y = [to_gpu(_) for _ in target_y] output = net(x_data=x, x_char_data=x_char, x_additional=x_additional) predict, loss = net.predict(output, target_y) # loss sum_loss += loss.data # update net.zerograds() loss.backward() opt.update() predict_train.extend(predict) # Evaluation train_accuracy = util.eval_accuracy(predict_train) logging.info('epoch:' + str(epoch)) logging.info(' [train]') logging.info(' loss :' + str(sum_loss)) logging.info(' accuracy :' + str(train_accuracy)) # Dev predict_dev, loss_dev, predict_dev_tags = eval_loop( x_dev, x_char_dev, y_dev, x_dev_additionals) gold_predict_pairs = [y_dev_cpu, predict_dev_tags] result, phrase_info = util.conll_eval(gold_predict_pairs, flag=False, tag_class=tag_names) all_result = result['All_Result'] # Evaluation dev_accuracy = util.eval_accuracy(predict_dev) logging.info(' [dev]') logging.info(' loss :' + str(loss_dev)) logging.info(' accuracy :' + str(dev_accuracy)) logging.info(' f_measure :' + str(all_result[-1])) dev_f = all_result[-1] if prev_dev_f < dev_f: logging.info(' [update best model on dev set!]') dev_list = [prev_dev_f, dev_f] dev_str = ' ' + ' => '.join(map(str, dev_list)) logging.info(dev_str) prev_dev_f = dev_f # Save model model_filename = save_name + '_epoch' + str(epoch) serializers.save_hdf5(model_filename + '.model', net) serializers.save_hdf5(model_filename + '.state', opt)
def run(data_file, is_train=False, **args): is_test = not is_train batchsize = args['batchsize'] model_name = args['model_name'] optimizer_name = args['optimizer'] save_dir = args['save_dir'] print args if save_dir[-1] != '/': save_dir = save_dir + '/' # TODO: check save_dir exist if not os.path.isdir(save_dir): err_msg = 'There is no dir : {}\n'.format(save_dir) err_msg += '##############################\n' err_msg += '## Please followiing: \n' err_msg += '## $ mkdir {}\n'.format(save_dir) err_msg += '##############################\n' raise ValueError(err_msg) save_name = args['save_name'] if save_name == '': save_name = '_'.join([model_name, optimizer_name]) save_name = save_dir + save_name xp = cuda.cupy if args['gpu'] >= 0 else np if args['gpu'] >= 0: cuda.get_device(args['gpu']).use() xp.random.seed(1234) # load files dev_file = args['dev_file'] test_file = args['test_file'] delimiter = args['delimiter'] sentences_train = [] if is_train: sentences_train = util.read_conll_file(filename=data_file, delimiter=delimiter, input_idx=0, output_idx=-1) if len(sentences_train) == 0: s = str(len(sentences_train)) err_msg = 'Invalid training sizes: {} sentences. '.format(s) raise ValueError(err_msg) else: # Predict sentences_train = util.read_raw_file(filename=data_file, delimiter=u' ') # sentences_train = sentences_train[:100] sentences_dev = [] sentences_test = [] if dev_file: sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter, input_idx=0, output_idx=-1) if test_file: sentences_test = util.read_conll_file(test_file, delimiter=delimiter, input_idx=0, output_idx=-1) save_vocab = save_name + '.vocab' save_vocab_char = save_name + '.vocab_char' save_tags_vocab = save_name + '.vocab_tag' save_train_config = save_name + '.train_config' # TODO: check unkown pos tags # TODO: compute unk words if is_train: sentences_words_train = [w_obj[0] for w_obj in sentences_train] vocab = util.build_vocab(sentences_words_train) vocab_char = util.build_vocab(util.flatten(sentences_words_train)) vocab_tags = util.build_tag_vocab(sentences_train) elif is_test: vocab = util.load_vocab(save_vocab) vocab_char = util.load_vocab(save_vocab_char) vocab_tags = util.load_vocab(save_tags_vocab) PAD_IDX = vocab[PADDING] UNK_IDX = vocab[UNKWORD] CHAR_PAD_IDX = vocab_char[PADDING] CHAR_UNK_IDX = vocab_char[UNKWORD] def parse_to_word_ids(sentences): return util.parse_to_word_ids(sentences, xp=xp, vocab=vocab, UNK_IDX=UNK_IDX, idx=0) def parse_to_char_ids(sentences): return util.parse_to_char_ids(sentences, xp=xp, vocab_char=vocab_char, UNK_IDX=CHAR_UNK_IDX, idx=0) def parse_to_tag_ids(sentences): return util.parse_to_tag_ids(sentences, xp=xp, vocab=vocab_tags, UNK_IDX=-1, idx=-1) # if is_train: x_train = parse_to_word_ids(sentences_train) x_char_train = parse_to_char_ids(sentences_train) y_train = parse_to_tag_ids(sentences_train) # elif is_test: # x_predict = parse_to_word_ids(sentences_predict) # x_char_predict = parse_to_char_ids(sentences_predict) # y_predict = parse_to_tag_ids(sentences_predict) x_dev = parse_to_word_ids(sentences_dev) x_char_dev = parse_to_char_ids(sentences_dev) y_dev = parse_to_tag_ids(sentences_dev) x_test = parse_to_word_ids(sentences_test) x_char_test = parse_to_char_ids(sentences_test) y_test = parse_to_tag_ids(sentences_test) cnt_train_unk = sum([xp.sum(d == UNK_IDX) for d in x_train]) cnt_train_word = sum([d.size for d in x_train]) unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word cnt_dev_unk = sum([xp.sum(d == UNK_IDX) for d in x_dev]) cnt_dev_word = sum([d.size for d in 
x_dev]) unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1) logging.info('train:' + str(len(x_train))) logging.info('dev :' + str(len(x_dev))) logging.info('test :' + str(len(x_test))) logging.info('vocab :' + str(len(vocab))) logging.info('vocab_tags:' + str(len(vocab_tags))) logging.info('unk count (train):' + str(cnt_train_unk)) logging.info('unk rate (train):' + str(unk_train_unk_rate)) logging.info('cnt all words (train):' + str(cnt_train_word)) logging.info('unk count (dev):' + str(cnt_dev_unk)) logging.info('unk rate (dev):' + str(unk_dev_unk_rate)) logging.info('cnt all words (dev):' + str(cnt_dev_word)) # show model config logging.info('######################') logging.info('## Model Config') logging.info('model_name:' + str(model_name)) logging.info('batchsize:' + str(batchsize)) logging.info('optimizer:' + str(optimizer_name)) # Save model config logging.info('######################') logging.info('## Model Save Config') logging.info('save_dir :' + str(save_dir)) # save vocab logging.info('save_vocab :' + save_vocab) logging.info('save_vocab_char :' + save_vocab_char) logging.info('save_tags_vocab :' + save_tags_vocab) logging.info('save_train_config :' + save_train_config) util.write_vocab(save_vocab, vocab) util.write_vocab(save_vocab_char, vocab_char) util.write_vocab(save_tags_vocab, vocab_tags) util.write_vocab(save_train_config, args) net = BiLSTM_CNN_CRF(n_vocab=len(vocab), n_char_vocab=len(vocab_char), emb_dim=args['n_word_emb'], hidden_dim=args['n_hidden'], n_layers=args['n_layer'], init_emb=None, n_label=len(vocab_tags)) if args['word_emb_file']: # set Pre-trained embeddings # emb_file = './emb/glove.6B.100d.txt' emb_file = args['word_emb_file'] word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab) net.word_embed.W.data[word_ids] = word_vecs if args['gpu'] >= 0: net.to_gpu() init_alpha = args['init_lr'] if optimizer_name == 'adam': opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9) elif optimizer_name == 'adadelta': opt = optimizers.AdaDelta() if optimizer_name == 'sgd_mom': opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9) if optimizer_name == 'sgd': opt = optimizers.SGD(lr=init_alpha) opt.setup(net) opt.add_hook(chainer.optimizer.GradientClipping(5.0)) def eval_loop(x_data, x_char_data, y_data): # dev or test net.set_train(train=False) iteration_list = range(0, len(x_data), batchsize) # perm = np.random.permutation(len(x_data)) sum_loss = 0.0 predict_lists = [] for i_index, index in enumerate(iteration_list): x = x_data[index:index + batchsize] x_char = x_char_data[index:index + batchsize] target_y = y_data[index:index + batchsize] output = net(x_data=x, x_char_data=x_char) predict, loss = net.predict(output, target_y) sum_loss += loss.data predict_lists.extend(predict) return predict_lists, sum_loss if is_test: # predict model_filename = args['model_filename'] model_filename = save_dir + model_filename serializers.load_hdf5(model_filename, net) vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()]) x_predict = x_train x_char_predict = x_char_train y_predict = y_train predict_pairs, _ = eval_loop(x_predict, x_char_predict, y_predict) _, predict_tags = zip(*predict_pairs) predicted_output = args['predicted_output'] predicted_results = [] for predict in predict_tags: predicted = [ vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict) ] predicted_results.append(predicted) f = open(predicted_output, 'w') for predicted in predicted_results: for tag in predicted: f.write(tag + '\n') f.write('\n') f.close() return False tmax = 
args['max_iter'] t = 0.0 for epoch in xrange(args['max_iter']): # train net.set_train(train=True) iteration_list = range(0, len(x_train), batchsize) perm = np.random.permutation(len(x_train)) sum_loss = 0.0 predict_train = [] for i_index, index in enumerate(iteration_list): data = [(x_train[i], x_char_train[i], y_train[i]) for i in perm[index:index + batchsize]] x, x_char, target_y = zip(*data) output = net(x_data=x, x_char_data=x_char) predict, loss = net.predict(output, target_y) # loss sum_loss += loss.data # update net.zerograds() loss.backward() opt.update() predict_train.extend(predict) # Evaluation train_accuracy = util.eval_accuracy(predict_train) logging.info('epoch:' + str(epoch)) logging.info(' [train]') logging.info(' loss :' + str(sum_loss)) logging.info(' accuracy :' + str(train_accuracy)) # Dev predict_dev, loss_dev = eval_loop(x_dev, x_char_dev, y_dev) # Evaluation dev_accuracy = util.eval_accuracy(predict_dev) logging.info(' [dev]') logging.info(' loss :' + str(loss_dev)) logging.info(' accuracy :' + str(dev_accuracy)) # Save model model_filename = save_name + '_epoch' + str(epoch) serializers.save_hdf5(model_filename + '.model', net) serializers.save_hdf5(model_filename + '.state', opt)
    filtered, expanded, unique_coverage / len(filtered_vocab),
    token_coverage / sum(cnt for k, cnt in filtered_vocab)))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, required=True)
    # parser.add_argument('--vocab_size', type=int, default=30000)
    parser.add_argument('--smaller', action="store_true")
    args = parser.parse_args()

    dataset = args.dataset
    # vocab_size = args.vocab_size
    smaller = args.smaller

    vocab, vocab_filtered = load_vocab(dataset, smaller)

    # # load dataset
    # if dataset.lower() == "reddit":
    #     dataset_path = "./data/Reddit/train-{0}.txt"
    # elif dataset.lower() == "twitter":
    #     dataset_path = "./data/Twitter/train-{0}.txt"
    # print("Loading dataset...")
    # src = load_file(dataset_path.format("src"))
    # tgt = load_file(dataset_path.format("tgt"))
    # data = src + tgt
    # print("number of sentence pairs: ", len(src))
    # # vocab
    # print("Building vocab...")
    # vocab = Counter([w for l in data for w in l])
    # print("vocab size: {0}".format(len(vocab)))
gamma = 0.1
n_epochs = 10
clip = 1
teacher_forcing_ration = 1
coverage_loss_lambda = 1.0
eps = 0.0000001

type = 'predict'
if type == 'train':
    data_directory = os.path.join(parent_directory, 'data') + '\\'

    # if the vocabulary file already exists, load it
    vocab = None
    if os.path.exists(vocab_file):
        vocab = load_vocab(vocab_file)

    # load the training data
    source, train_data = build_field_dataset_vocab(data_directory,
                                                   chat_source_name,
                                                   chat_target_name,
                                                   vocab)
    train_iterator, val_iterator = get_dataset(source, train_data, batch_size)

    # save the source vocabulary
    if vocab is None:
        save_vocab(source.vocab, vocab_file)

    model, optimizer, scheduler = build_model(
        source, encoder_embedding_dim, decoder_embedding_dim, hidden_dim,
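# Hedged sketch (assumption): `save_vocab` / `load_vocab` in this script are not
# shown; since the torchtext Field vocabulary object `source.vocab` is saved and
# later passed back in, a thin pickle wrapper would be the simplest implementation.
import pickle


def save_vocab(vocab, vocab_file):
    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)


def load_vocab(vocab_file):
    with open(vocab_file, 'rb') as f:
        return pickle.load(f)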
    return top_n


def print_top_n(top_n):
    for k, ((words, i, candidate), prob) in enumerate(top_n):
        inserted = words[:i] + ['***%s***' % candidate] + words[i:]
        prediction = ' '.join(inserted)
        print "%d) P = %.3f: %s" % (k, -prob, prediction)


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('model', help='KenLM n-gram model file (ARPA or binary)')
    parser.add_argument('vocab', type=argparse.FileType('r'), help='Vocab file')
    parser.add_argument('-n', type=int, default=5, help='Number of best sentences to report')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    print >>sys.stderr, "Loading vocab"
    vocab = load_vocab(args.vocab)
    print >>sys.stderr, "%d words in vocab" % len(vocab)

    print >>sys.stderr, "Loading language model"
    model = kenlm.LanguageModel(args.model)

    print >>sys.stderr, "Processing sentences"
    for line in sys.stdin:
        top_n = find_missing_word(model, vocab, line, args.n)
        print_top_n(top_n)
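# Hedged sketch (assumption): `find_missing_word` is defined earlier in this
# script and is not shown in full; based on how `print_top_n` unpacks its
# result, it presumably tries every vocabulary word at every insertion point,
# scores the candidate sentence with the KenLM model, and keeps the n best.
# The helper name and the whitespace tokenization below are illustrative.
import heapq


def find_missing_word_sketch(model, vocab, line, n):
    words = line.split()  # the real script may use its own tokenize_words helper
    scored = []
    for i in range(len(words) + 1):
        for candidate in vocab:
            sentence = ' '.join(words[:i] + [candidate] + words[i:])
            # kenlm's score() returns a log10 probability; store its negation
            # so that "smallest" means "most probable".
            scored.append(((words, i, candidate), -model.score(sentence)))
    return heapq.nsmallest(n, scored, key=lambda x: x[1])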
import copy
import os
import random
from queue import Queue

import torch
from torch.autograd import Variable
import kenlm

import config
from dataset import Poemsets
from model import PoetryGenerator
import util

vocab = util.load_vocab()
commons = vocab[:2500]  # most frequent words
word2idx, idx2word = util.load_word2idx_idx2word(vocab)
icommons = [word2idx[w] for w in commons]
pingzes, yuns = util.read_pingshuiyun(use_index=True)
word2vec = util.load_word2vec()

LM = os.path.join(os.path.dirname(__file__), '..', 'data', 'rnnpg_data_emnlp-2014',
                  'partitions_in_Table_2', 'poemlm', 'qts.klm')
kmodel = kenlm.Model(LM)

poemset = Poemsets(
    os.path.join(config.dir_rnnpg, "partitions_in_Table_2", "rnnpg", "qtrain_7"))

pg = PoetryGenerator(vocab_size=len(vocab),
                     embedding_dim=256,
args = opts().parse_args()

print "Loading golden sentences"
golden = map(tokenize_words, args.golden)

print "Loading locations of removed words"
golden_loc = np.asarray(map(int, args.i_removed))

print "Loading predictions"
predictions = map(Prediction.parse, args.predicted)

assert len(golden) == len(golden_loc)
if len(predictions) < len(golden):
    n = len(predictions)
    golden = golden[:n]
    golden_loc = golden_loc[:n]
    print "Assuming first %d sentences" % n

print "Loading syntactic n-gram counts"
sngs = load_vocab(args.syntactic_ngrams)
print "Loaded %d syntactic ngrams" % len(sngs)
sngp = estimate_probabilities(sngs)

print "Loading bad syntactic ngrams"
bad_ngrams = set(pickle.load(args.bad_syntactic_ngrams))

print "Computing cost for each choice"
d = cost_per_choice(golden, golden_loc, predictions)

print "Identifying optimal choices"
y = np.argmin(d, axis=1)
best = [di[yi] for di, yi in izip(d, y)]
dx = np.mean(best)
error = np.std(best) / np.sqrt(len(best))
print "Best achievable Levenshtein distance: %.3f +/- %.3f" % (dx, error)

unk = set(('<s>', '</s>', '<unk>', UNKNOWN))
from util import load_vocab
from online_util import get_inputs, get_answers, get_tuple_answers
from baiduSpider import get_evidences

STOP_TAG = "#OOV#"


class Hyperparameters:
    vocab_path = '../char_data/vocabulary.txt'
    random_path = '../char_data/training.h5'
    # charQA_path = '../model/charQA_2017-08-11/f1-0.5583_0.34799_2'
    charQA_path = '../model/lossQA_2017-08-14/f1-0.5698_0.26918_5'


param = Hyperparameters()
word_set, word2idx, word_set_size = load_vocab(param.vocab_path)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))


def random_sample():
    file = h5py.File(param.random_path)
    nb_samples = len(file['question'][:])
    index = random.randint(0, nb_samples - 1)
    question = file['question'][index]
    question = ''.join([idx2word[q] for q in question if q != 0])
    return question


class baselineQA(object):
    def __init__(self):
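# Hedged sketch (assumption): `load_vocab` itself is not shown in these
# snippets; given that callers unpack (word_set, word2idx, vocab_size) and that
# index 0 is treated as padding above (`q != 0`), a minimal stand-in reading
# one token per line of vocabulary.txt could look like this hypothetical helper:
def load_vocab_sketch(vocab_path):
    with open(vocab_path, encoding='utf-8') as f:
        word_set = [line.rstrip('\n') for line in f if line.strip()]
    word2idx = {word: i + 1 for i, word in enumerate(word_set)}  # 0 reserved for padding
    return word_set, word2idx, len(word2idx) + 1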
torch.cuda.manual_seed(opt.seed)

# device
device_type = "cuda" if opt.cuda else "cpu"
device_ids = None
if opt.local_rank is not None:
    device_type += ":" + str(opt.local_rank)
    device_ids = [opt.local_rank]
device = torch.device(device_type)

# tensorboardX
writer = SummaryWriter()

# load vocabulary for source and target
src_vocab, trg_vocab = {}, {}
src_vocab["stoi"] = load_vocab(opt.src_vocab)
trg_vocab["stoi"] = load_vocab(opt.trg_vocab)
src_vocab["itos"] = invert_vocab(src_vocab["stoi"])
trg_vocab["itos"] = invert_vocab(trg_vocab["stoi"])

UNK = "<unk>"
SOS = "<sos>"
EOS = "<eos>"
PAD = "<pad>"
opt.enc_pad = src_vocab["stoi"][PAD]
opt.dec_sos = trg_vocab["stoi"][SOS]
opt.dec_eos = trg_vocab["stoi"][EOS]
opt.dec_pad = trg_vocab["stoi"][PAD]
opt.enc_num_token = len(src_vocab["stoi"])
opt.dec_num_token = len(trg_vocab["stoi"])

# load dataset for training and validation
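# Hedged sketch (assumption): `invert_vocab` is not shown; turning the
# string-to-index ("stoi") mapping into index-to-string ("itos") is presumably just:
def invert_vocab(stoi):
    return {index: token for token, index in stoi.items()}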
tf.app.flags.DEFINE_integer("label_size", 2, "Size of the label.")
tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.95,
                          "Learning rate decays by this much.")
tf.app.flags.DEFINE_float("max_gradient_norm", 5.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("dropout", 0.0,
                          "Fraction of units randomly dropped on non-recurrent connections.")
tf.app.flags.DEFINE_integer("batch_size", 50, "Batch size to use during training.")
tf.app.flags.DEFINE_integer("epochs", 1, "Number of epochs to train.")
tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training weights are saved in it.")
tf.app.flags.DEFINE_integer("print_every", 1, "How many iterations to do per print.")
tf.app.flags.DEFINE_integer('seed', 1234, 'random seed')
tf.app.flags.DEFINE_string('mode', 'sq2sq', 'Choose [sq2sq, att, gen]')
tf.app.flags.DEFINE_string('gpu', '3', 'Choose GPU ID: [0,1,2,3]')
tf.app.flags.DEFINE_string('embed', 'w2v',
                           'Choose embedding: [w2v, glove50, glove100, glove200, glove300]')

word_idx_map, idx_word_map = load_vocab(VOCAB_PATH)
vocab_size = len(idx_word_map)

loader = StoryLoader(STORY_DATA_PATH,
                     batch_size=50, src_seq_len=65,
                     tgt_seq_len=20, mode='merged')

if FLAGS.embed == 'w2v':
    embedding = loader.get_w2v_embed().astype('float32')

model = StoryModel(vocab_size, FLAGS.label_size, FLAGS.size,
                   FLAGS.num_layers, FLAGS.batch_size, FLAGS.learning_rate,
                   FLAGS.learning_rate_decay_factor, FLAGS.dropout, embedding,
                   FLAGS.src_steps, FLAGS.tgt_steps, FLAGS.mode,
                   FLAGS.max_gradient_norm, forward_only=False)
        exp_norm = 0.99 * exp_norm + 0.01 * grad_norm
        cost = cost / mean_length

        if current_step % FLAGS.print_every == 0:
            print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, '
                  'batch time %f, length mean/std %f/%f' %
                  (epoch, current_step, cost, exp_cost / exp_length, grad_norm,
                   param_norm, iter_time, mean_length, std_length))

    ## Checkpoint
    checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
    model.saver.save(sess, checkpoint_path, global_step=model.global_step)

    valid_costs, valid_lengths = [], []
    for source_tokens, source_mask, target_tokens, target_mask in PairIter(
            x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
        cost, _ = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
        valid_costs.append(cost * target_mask.shape[1])
        valid_lengths.append(np.sum(target_mask[1:, :]))
    valid_cost = sum(valid_costs) / float(sum(valid_lengths))
    print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

    previous_losses.append(valid_cost)
    if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
        sess.run(model.learning_rate_decay_op)
    sys.stdout.flush()


if __name__ == '__main__':
    np.random.seed(FLAGS.seed)
    word_idx_map, idx_word_map = load_vocab('')
# build the model according to params
if not args.load_model:
    logger, opts = setup_exp(args)
    logger.info(args)
else:
    logger = get_logger(args.expdir)
    with open(pjoin(args.load_model, 'opts.json'), 'r') as fin:
        loaded_opts = json.load(fin)
        for k in loaded_opts:
            if k not in ['expdir', 'load_model', 'seed', 'save_err']:
                setattr(args, k, loaded_opts[k])
    logger.info(args)

logger.info('loading data...')
word_idx_map, idx_word_map = load_vocab(VOCAB_PATH)
vocab_size = len(idx_word_map)

if not args.pretrain:
    # we train on the validation/test sets
    loader = StoryLoader(STORY_DATA_PATH,
                         batch_size=args.batch_size,
                         src_seq_len=65, tgt_seq_len=20,
                         train_frac=0.45, valid_frac=0.05,
                         mode='merged')
else:
    # we only train on 40% of validation for the target encoder
    loader = StoryLoader(STORY_DATA_PATH,
                         batch_size=args.batch_size,
import sys, argparse
from collections import defaultdict

from util import tokenize_words, load_vocab


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('vocab', type=argparse.FileType('r'),
                        help='File with vocabulary')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print("Loading vocab", file=sys.stderr)
    vocab = load_vocab(args.vocab)
    print("Determining best-guess case for each word", file=sys.stderr)
    lower_to_case = {}
    for word, freq in vocab.items():
        lowercase = word.lower()
        if lowercase in lower_to_case:
            prev_freq = lower_to_case[lowercase][1]
            if freq > prev_freq:
                lower_to_case[lowercase] = (word, freq)
        else:
            lower_to_case[lowercase] = (word, freq)
    del vocab
    for k in list(lower_to_case.keys()):
        lower_to_case[k] = lower_to_case[k][0]
    print("Processing predictions", file=sys.stderr)
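
# Not part of the script above: a minimal sketch of what the "Processing
# predictions" step could look like, assuming predictions arrive on stdin as
# whitespace-tokenized sentences; lower_to_case and tokenize_words are the
# names used in that script.
def recase_tokens(tokens, lower_to_case):
    # Fall back to the original token when its lowercase form is unseen.
    return [lower_to_case.get(t.lower(), t) for t in tokens]

# Example usage (hypothetical driver loop):
# for line in sys.stdin:
#     print(' '.join(recase_tokens(tokenize_words(line), lower_to_case)))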
#!/usr/bin/env python
'''Replace words with their word2vec class'''
import sys, argparse

from util import tokenize_words, load_vocab, UNKNOWN


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes', type=argparse.FileType('r'),
                        help='File with word2vec classes')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()
    print("Loading word2vec classes", file=sys.stderr)
    vocab = load_vocab(args.classes)
    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print(' '.join(map(str, words)))
        if i % 100000 == 0:
            print(i, file=sys.stderr)
args = opts().parse_args()

print("Loading golden sentences")
golden = list(map(tokenize_words, args.golden))
print("Loading locations of removed words")
golden_loc = np.asarray(list(map(int, args.i_removed)))
print("Loading predictions")
predictions = list(map(Prediction.parse, args.predicted))
assert len(golden) == len(golden_loc)
if len(predictions) < len(golden):
    n = len(predictions)
    golden = golden[:n]
    golden_loc = golden_loc[:n]
    print("Assuming first %d sentences" % n)

print("Loading syntactic n-gram counts")
sngs = load_vocab(args.syntactic_ngrams)
print("Loaded %d syntactic ngrams" % len(sngs))
sngp = estimate_probabilities(sngs)
print("Loading bad syntactic ngrams")
bad_ngrams = set(pickle.load(args.bad_syntactic_ngrams))

print("Computing cost for each choice")
d = cost_per_choice(golden, golden_loc, predictions)
print("Identifying optimal choices")
y = np.argmin(d, axis=1)
best = [di[yi] for di, yi in zip(d, y)]
dx = np.mean(best)
error = np.std(best) / np.sqrt(len(best))
print("Best achievable Levenshtein distance: %.3f +/- %.3f" % (dx, error))

unk = set(('<s>', '</s>', '<unk>', UNKNOWN))
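
# estimate_probabilities is not defined in this snippet; a minimal sketch under
# the assumption that sngs maps each syntactic n-gram to a raw count, using
# plain relative frequency (the real helper may condition differently).
def estimate_probabilities_sketch(counts):
    total = float(sum(counts.values()))
    # Return an empty table rather than dividing by zero.
    return {ngram: c / total for ngram, c in counts.items()} if total else {}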
    embedding_weight = get_weight_matrix(raw_embedding, word2idx)
    embedding = nn.Embedding(len(vocab), embedding_dim=embedding_dim)
    embedding.weight = nn.Parameter(
        torch.from_numpy(embedding_weight).float())
    return embedding


if __name__ == '__main__':
    print("TOY EXAMPLE, JUST FOR TEST!!!")
    import numpy as np
    import torch.optim as optim

    vocab = load_vocab()
    word2idx, idx2word = load_word2idx_idx2word(vocab)
    poetry = "鹤 湖 东 去 水 茫 茫 一 面 风 泾 接 魏 塘 看 取 松 江 布 帆 至 鲈 鱼 切 玉 劝 郎 尝"
    sentences = [s.split() for s in poetry.split("\t")]
    isentences = [[word2idx[w] for w in s] for s in sentences]
    print(sentences)
    print(isentences)

    batch_size = 1
    epochs = 10  # after training long enough, the model is able to "memorize" some of this information

    # optimizer parameters
    lr = 0.01
    decay_factor = 0.00001
    betas = (0.9, 0.999)
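
# get_weight_matrix is not shown above; a hypothetical sketch, assuming
# raw_embedding maps word -> 1-D numpy vector and out-of-vocabulary rows are
# left as zeros (the embedding_dim argument is an assumption of this sketch).
import numpy as np

def get_weight_matrix_sketch(raw_embedding, word2idx, embedding_dim):
    weights = np.zeros((len(word2idx), embedding_dim), dtype=np.float32)
    for word, idx in word2idx.items():
        vector = raw_embedding.get(word)
        if vector is not None:
            weights[idx] = vector
    return weights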
train_set = NameTaggingDataset(os.path.join(
    args.input, dataset, '{}train.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)
dev_set = NameTaggingDataset(os.path.join(
    args.input, dataset, '{}dev.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)
test_set = NameTaggingDataset(os.path.join(
    args.input, dataset, '{}test.tsv'.format(args.prefix)),
    conll_parser, gpu=use_gpu)

# embedding vocab
if args.embed_vocab:
    embed_vocab = load_vocab(args.embed_vocab)
else:
    embed_vocab = build_embedding_vocab(args.embed)

# vocabulary
token_vocab = load_vocab(os.path.join(
    args.input, dataset, '{}token.vocab.tsv'.format(args.prefix)))
char_vocab = load_vocab(os.path.join(
    args.input, dataset, '{}char.vocab.tsv'.format(args.prefix)))
label_vocab = load_vocab(os.path.join(
    args.input, dataset, '{}label.vocab.tsv'.format(args.prefix)))
label_itos = {i: l for l, i in label_vocab.items()}

train_token_counter = train_set.token_counter
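
# The load_vocab used above is defined elsewhere; a minimal sketch under the
# assumption that each *.vocab.tsv file stores one "entry<TAB>index" pair per
# line (the name load_vocab_tsv_sketch is hypothetical).
def load_vocab_tsv_sketch(path):
    vocab = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            entry, idx = line.rstrip('\n').split('\t')
            vocab[entry] = int(idx)
    return vocab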
config.read(os.path.join(parent_directory, 'resource') + '/config.cfg')
section = config.sections()[0]

parser = argparse.ArgumentParser(description='seq2seq_attention_chatbot')
parser.add_argument('-type', default='train',
                    help='train or predict with seq2seq!', type=str)
args = parser.parse_args()

if args.type == 'train':
    data_directory = os.path.join(parent_directory, 'data') + '\\'
    # If a saved vocabulary already exists, load it
    vocab = None
    if os.path.exists(config.get(section, 'vocab')):
        vocab = load_vocab(config.get(section, 'vocab'))
    # Load the training data
    source, train_iterator, val_iterator = build_field_dataset_vocab(
        data_directory,
        config.get(section, 'chat_source_name'),
        config.get(section, 'chat_target_name'),
        vocab)
    # Save the source vocabulary
    if vocab is None:
        save_vocab(source.vocab, config.get(section, 'vocab'))
    model, optimizer, scheduler, criterion = build_model(
        source,
        config.getint(section, 'encoder_embedding_dim'),
        config.getint(section, 'decoder_embedding_dim'),
        config.getint(section, 'hidden_dim'),
        config.getint(section, 'n_layers'),
        config.getfloat(section, 'encoder_dropout'),
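
# save_vocab / load_vocab are project helpers not shown here; one plausible
# pickle-based pair, assuming source.vocab is an ordinary picklable vocabulary
# object (both names below are hypothetical stand-ins).
import pickle

def save_vocab_sketch(vocab, path):
    with open(path, 'wb') as f:
        pickle.dump(vocab, f)

def load_vocab_sketch(path):
    with open(path, 'rb') as f:
        return pickle.load(f)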
parser.add_argument('--info', type=str, help='info of the model')
opt = parser.parse_args()

# set the random seed manually
torch.manual_seed(opt.seed)
opt.cuda = opt.cuda and torch.cuda.is_available()
if opt.cuda:
    torch.cuda.manual_seed(opt.seed)
device = torch.device('cuda' if opt.cuda else 'cpu')

# load vocabulary for source and target
src_vocab, trg_vocab = {}, {}
src_vocab['stoi'] = load_vocab(opt.src_vocab)
trg_vocab['stoi'] = load_vocab(opt.trg_vocab)
src_vocab['itos'] = invert_vocab(src_vocab['stoi'])
trg_vocab['itos'] = invert_vocab(trg_vocab['stoi'])

UNK = '<unk>'
SOS = '<sos>'
EOS = '<eos>'
PAD = '<pad>'
opt.enc_pad = src_vocab['stoi'][PAD]
opt.dec_sos = trg_vocab['stoi'][SOS]
opt.dec_eos = trg_vocab['stoi'][EOS]
opt.dec_pad = trg_vocab['stoi'][PAD]
opt.enc_ntok = len(src_vocab['stoi'])
opt.dec_ntok = len(trg_vocab['stoi'])

# load dataset for testing
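
# invert_vocab is not defined in this snippet; a minimal sketch, assuming the
# string-to-index mapping is one-to-one.
def invert_vocab_sketch(stoi):
    return {idx: token for token, idx in stoi.items()}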