import argparse
import os


def main():
    hlp = "Compute some stats about alignments."
    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument('--corpus', required=True, help="Corpus to use.",
                        choices=["dailymail", "cnn"])
    parser.add_argument('--data-path', required=True,
                        help="Path to Cheng&Lapata data.")
    parser.add_argument("--alignments-path", required=True,
                        help="Path to token alignments.")
    parser.add_argument("--output-vocab", required=True,
                        help="Location to write output vocab.")
    args = parser.parse_args()

    #vocab_in = read_vocab(args.input_vocab)
    vocab_out = read_vocab(args.output_vocab)

    for split in ["training", "validation", "test"]:
        print("Split: {}".format(split))
        data_path = os.path.join(args.data_path, args.corpus, split)
        alignments_path = os.path.join(
            args.alignments_path, args.corpus, split)
        collect_split_stats(data_path, alignments_path, vocab_out)
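# `read_vocab` is provided elsewhere in this snippet's repo and is not shown
# here. A minimal sketch of what such a helper typically does (one token per
# line -> list); treat the details as an assumption:
def read_vocab(vocab_path):
    """Assumed helper: read a newline-delimited vocabulary file into a list."""
    with open(vocab_path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f if line.strip()]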
def __init__(self, train_input_file, train_target_file, test_input_file,
             test_target_file, vocab_file, num_units, layers, dropout,
             batch_size, learning_rate, output_dir, save_step=100,
             eval_step=1000, param_histogram=False, restore_model=False,
             init_train=True, init_infer=False):
    self.num_units = num_units
    self.layers = layers
    self.dropout = dropout
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.save_step = save_step
    self.eval_step = eval_step
    self.param_histogram = param_histogram
    self.restore_model = restore_model
    self.init_train = init_train
    self.init_infer = init_infer

    if init_train:
        self.train_reader = data.SeqReader(train_input_file,
                                           train_target_file, vocab_file,
                                           batch_size)  # training reader
        self.train_reader.start()
        self.train_data = self.train_reader.read()
        self.eval_reader = data.SeqReader(test_input_file, test_target_file,
                                          vocab_file, batch_size)  # evaluation reader
        self.eval_reader.start()
        self.eval_data = self.eval_reader.read()

    self.model_file = path.join(output_dir, 'model.ckpl')
    self.log_writter = tf.summary.FileWriter(output_dir)

    if init_train:
        self._init_train()
        self._init_eval()

    if init_infer:
        self.infer_vocabs = data.read_vocab(vocab_file)
        self.infer_vocab_indices = dict(
            (c, i) for i, c in enumerate(self.infer_vocabs))
        self._init_infer()
        self.reload_infer_model()
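# A hedged usage sketch: the enclosing class name ("Trainer") and the file
# paths below are placeholders, not taken from the constructor above; only
# the signature is.
trainer = Trainer(
    train_input_file='data/train.input', train_target_file='data/train.target',
    test_input_file='data/test.input', test_target_file='data/test.target',
    vocab_file='data/vocab.txt', num_units=256, layers=2, dropout=0.2,
    batch_size=32, learning_rate=0.001, output_dir='model/',
    init_train=True, init_infer=False)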
def evaluate_line():
    config_path = os.path.join(FLAGS.config_path, 'config')
    test_config = load_config(config_path)
    _, word_to_id = read_vocab(test_config['vocab_file'])
    categorys, cat_to_id = read_category()
    contents, labels = read_file('data/cnews.val2.txt')
    model = Model(test_config)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    # Restore the latest checkpoint.
    checkpoint_path = os.path.join(FLAGS.checkpoints_path)
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_path)
    saver.restore(session, checkpoint_file)

    while True:
        line = input("Enter a test sentence: ")
        x_input = [[word_to_id[x] for x in line if x in word_to_id]]
        x_pad = kr.preprocessing.sequence.pad_sequences(x_input, 600)
        predict = model.evaluate(session, x_pad)
        print(categorys[predict[0][0]])
def get_train_config():
    train_contents, train_labels = read_file(FLAGS.train_file)

    # 1. Build the vocabulary from the training data, or load it if it exists.
    if not os.path.exists(FLAGS.vocab_file):
        words = build_vocab(train_contents, FLAGS.vocab_file)
    else:
        words, _ = read_vocab(FLAGS.vocab_file)

    # 2. Read the category labels and build the category-to-id mapping.
    categories, cat_to_id = read_category()

    # 3. Generate the training configuration.
    vocab_size = len(words)
    num_classes = len(categories)
    # Using the true maximum length can exhaust memory:
    # seq_len = max([len(content) for content in train_contents])
    seq_len = 600
    filter_sizes = [int(i) for i in FLAGS.filter_sizes.split(',')]

    # Create the output directories and write the config file if it is missing.
    make_path(FLAGS)
    config_path = os.path.join(FLAGS.config_path, 'config')
    if not os.path.isfile(config_path):
        train_config = config_model(seq_len, vocab_size, num_classes,
                                    filter_sizes)
        save_config(train_config, config_path)
    else:
        # Load the existing config so the function always returns one.
        train_config = load_config(config_path)
    return train_config
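# `config_model` is defined elsewhere in this project and is not shown here.
# A hedged sketch of the dict it presumably returns, inferred from the keys
# read back in train() below ('vocab_file', 'train_file', 'val_file',
# 'seq_len', 'num_epochs', 'batch_size'); the concrete defaults and the
# 'val_file' flag name are assumptions.
def config_model(seq_len, vocab_size, num_classes, filter_sizes):
    return {
        'seq_len': seq_len,              # input sequence length
        'vocab_size': vocab_size,        # vocabulary size
        'num_classes': num_classes,      # number of target categories
        'filter_sizes': filter_sizes,    # convolution filter widths
        'vocab_file': FLAGS.vocab_file,
        'train_file': FLAGS.train_file,
        'val_file': FLAGS.val_file,      # assumed flag
        'num_epochs': 10,                # assumed default
        'batch_size': 64,                # assumed default
    }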
from itertools import chain

import nltk
import numpy as np

import data_utils

vocab = data_utils.read_vocab(data_utils.vocab_path)
X, Y = data_utils.prepare_data(data_utils.pos_train, vocab)

# pos = nltk.pos_tag()
# print(pos)

# X_indexes = []
# for window in X:
#     window_indexes = []
#     for word in window:
#         if word in vocab:
#             window_indexes.append(vocab[word])
#         else:
#             window_indexes.append(vocab['UUUNKKK'])
#     X_indexes.append(window_indexes)

# print(Y[:1])
print(X[:1])
# print(X_indexes[0])

# embeddings = data_utils.read_embeddings(data_utils.wv_path)
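# The commented-out block above converts each context window of words into
# vocabulary indices, falling back to the 'UUUNKKK' token for OOV words.
# A small working version of that conversion; the names come from the
# comments above, nothing else is assumed:
def windows_to_indexes(windows, vocab):
    unk = vocab['UUUNKKK']
    return [[vocab.get(word, unk) for word in window] for window in windows]

# X_indexes = windows_to_indexes(X, vocab)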
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import data_utils


def main():
    PRETRAIN_EMBEDDINGS = False
    USE_SUBWORDS = True
    if len(sys.argv) > 1:
        # sys.argv values are strings, so parse them into booleans explicitly.
        PRETRAIN_EMBEDDINGS = sys.argv[1].lower() in ('1', 'true', 'yes')
    if len(sys.argv) > 2:
        USE_SUBWORDS = sys.argv[2].lower() in ('1', 'true', 'yes')
    print('pre-trained embeddings is set to %s' % PRETRAIN_EMBEDDINGS)
    print('sub-words embeddings is set to %s' % USE_SUBWORDS)

    EPOCHS_TO_TRAIN = 100
    CONTEXT_SIZE = 5
    BATCH_SIZE = 1000
    WORKERS = 2

    vocab = data_utils.read_vocab(data_utils.vocab_path)
    embeddings = np.random.randn(len(vocab), 50) / np.sqrt(len(vocab))
    EMBEDDING_DIM = len(embeddings[0])
    vocab_reversed = {}
    if PRETRAIN_EMBEDDINGS:
        embeddings = data_utils.read_embeddings(data_utils.wv_path)
    if USE_SUBWORDS:
        embeddings, vocab, vocab_reversed = data_utils.generate_embeddings_with_prefixes(
            embeddings, vocab, EMBEDDING_DIM)

    print('Starting execution...')
    print('using EMBEDDING_DIM of %s' % EMBEDDING_DIM)
    print('using CONTEXT_SIZE of %s' % CONTEXT_SIZE)
    print('using BATCH_SIZE of %s' % BATCH_SIZE)
    print('using EPOCHS_TO_TRAIN of %s' % EPOCHS_TO_TRAIN)

    class Net(nn.Module):
        def __init__(self, vocab_size, embed_dim, context_size,
                     pretrained_embeddings):
            super(Net, self).__init__()
            self.embeddings = nn.Embedding(vocab_size, embed_dim)
            self.embeddings.weight.data.copy_(
                torch.from_numpy(pretrained_embeddings))
            # if PRETRAIN_EMBEDDINGS:
            #     self.embeddings.weight.requires_grad = False
            self.linear1 = nn.Linear(context_size * embed_dim, 128)
            self.linear2 = nn.Linear(128, len(data_utils.POS_TAGS))

        def forward(self, x):
            embeds = self.embeddings(x).view(
                (-1, CONTEXT_SIZE * EMBEDDING_DIM))
            if USE_SUBWORDS:
                # Add the prefix and suffix embeddings to the word embeddings.
                prefixes = get_prefixes_embeddings(x, vocab, vocab_reversed)
                suffixes = get_suffixes_embeddings(x, vocab, vocab_reversed)
                prefixes_tensor = Variable(
                    torch.from_numpy(prefixes).type(torch.LongTensor))
                suffixes_tensor = Variable(
                    torch.from_numpy(suffixes).type(torch.LongTensor))
                prefixes_embeds = self.embeddings(prefixes_tensor).view(
                    (-1, CONTEXT_SIZE * EMBEDDING_DIM))
                suffixes_embeds = self.embeddings(suffixes_tensor).view(
                    (-1, CONTEXT_SIZE * EMBEDDING_DIM))
                embeds = embeds + prefixes_embeds + suffixes_embeds
            out = F.tanh(self.linear1(embeds))
            out = F.log_softmax(self.linear2(out))
            return out

    net = Net(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, embeddings)

    print('Preparing train/test/dev sets')
    train_data_loader = data_utils.prepare_tensor_dataset(
        data_utils.pos_train, vocab, WORKERS, BATCH_SIZE)
    dev_data_loader = data_utils.prepare_tensor_dataset(
        data_utils.pos_dev, vocab, WORKERS, BATCH_SIZE)
    test_data_loader = data_utils.prepare_tensor_dataset(
        data_utils.pos_test, vocab, WORKERS, BATCH_SIZE, include_y=False)

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=0.01)

    dev_losses = []
    train_losses = []
    acceptances = []
    iterations = []

    print("Starting training loop")
    for idx in range(0, EPOCHS_TO_TRAIN):
        net.train()  # switch back to training mode after the evaluation pass
        for iteration, batch in enumerate(train_data_loader, 1):
            x, y = Variable(batch[0]), Variable(batch[1])
            optimizer.zero_grad()
            output = net(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

        if idx % 1 == 0:
            # Calculate loss and accuracy on the validation set.
            dev_loss = 0
            net.eval()
            correct = 0.0
            total = 0.0
            for dev_batch_idx, dev_batch in enumerate(dev_data_loader):
                x, y = Variable(dev_batch[0]), Variable(dev_batch[1])
                output = net(x)
                dev_loss = criterion(output, y)
                _, predicted = torch.max(output.data, 1)
                total += dev_batch[1].size(0)
                correct += (predicted == dev_batch[1]).sum()
            acc = correct / total
            acceptances.append(acc)
            train_losses.append(loss.data[0])
            dev_losses.append(dev_loss.data[0])
            iterations.append(idx)
            print("Epoch {: >8} TRAIN_LOSS: {: >8} DEV_LOSS: {: >8} ACC: {}"
                  .format(idx, loss.data[0], dev_loss.data[0], acc))

    print("Predicting the test file")
    net.eval()
    test_file = open(os.path.join(data_utils.POS_DIR, "test_results.txt"), 'w')
    for test_batch_idx, test_batch in enumerate(test_data_loader):
        x, y = Variable(test_batch[0]), Variable(test_batch[1])
        output = net(x)
        predictions = torch.max(output.data, 1)[1].numpy()
        for pos_index in predictions:
            test_file.write(data_utils.get_pos_name_by_index(pos_index) + "\n")
    test_file.close()

    print('Finished execution!')
    print('Plotting graphs..')
    fig = plt.figure()
    fig.suptitle("POS - random word vectors with prefix/suffix", fontsize=14)
    # Three stacked panels: train loss, dev loss, accuracy.
    ax = plt.subplot(311)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Train loss')
    ax.plot(iterations, train_losses, 'k')
    ax = plt.subplot(312)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Dev loss')
    ax.plot(iterations, dev_losses, 'k')
    ax = plt.subplot(313)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Acc')
    ax.plot(iterations, acceptances, 'k')
    plt.show()
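# A hedged usage note: assuming this script is saved as tagger3.py (the file
# name is a guess), the two optional argv flags toggle pre-trained embeddings
# and sub-word (prefix/suffix) embeddings, e.g.:
#
#     python tagger3.py true true
#
# sys.argv values are strings, which is why they are parsed into booleans
# at the top of main() rather than used directly.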
def train():
    train_config = get_train_config()
    _, word_to_id = read_vocab(train_config['vocab_file'])
    _, cat_to_id = read_category()

    # Set up the logger.
    logger = get_logger(os.path.join(FLAGS.log_path, 'train.log'))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    with tf.Session(config=config) as sess:
        # Build the model.
        model = Model(train_config)

        # Load the training data.
        x_train_data, y_train_data = process_file(train_config['train_file'],
                                                  word_to_id, cat_to_id,
                                                  train_config['seq_len'])
        # Load the validation data.
        x_val_data, y_val_data = process_file(train_config['val_file'],
                                              word_to_id, cat_to_id,
                                              train_config['seq_len'])
        # Initialize variables.
        sess.run(tf.global_variables_initializer())

        len_data = len(y_train_data)  # number of training samples
        start_time = time.time()
        total_batch = 0               # total number of batches processed
        best_acc_val = 0.0            # best validation accuracy so far
        last_improved = 0             # batch index of the last improvement
        require_improvement = 1000    # stop early after 1000 batches without improvement
        flag = False                  # whether to stop training

        # num_epochs: revisit the samples several times so features learned
        # early are not lost.
        for i in range(train_config['num_epochs']):
            for x_input, y_output in batch_iter(x_train_data, y_train_data,
                                                train_config['batch_size']):
                total_batch += 1
                step, acc, loss = model.run_step(sess, x_input, y_output)

                # Evaluate the model every FLAGS.evaluate_every batches.
                if total_batch % FLAGS.evaluate_every == 0:
                    time_dif = get_time_dif(start_time)
                    logger.info(
                        "train: iterator{}: step:{}/{} acc:{} loss:{} time:{}".
                        format(i + 1, step % len_data, len_data, acc, loss,
                               time_dif))
                    val_acc, text_los = evaluate(sess, model, x_val_data,
                                                 y_val_data)
                    logger.info("test: acc:{} loss:{} ".format(
                        val_acc, text_los))

                    # Save a checkpoint when validation accuracy improves.
                    if acc > 0.5 and val_acc > 0.5 and val_acc > best_acc_val:
                        last_improved = total_batch
                        best_acc_val = val_acc
                        checkpoint_path = os.path.join(FLAGS.checkpoints_path,
                                                       'checkpoints')
                        saver = tf.train.Saver(
                            tf.global_variables(),
                            max_to_keep=FLAGS.num_checkpoints)
                        saver.save(sess, checkpoint_path, global_step=step)

                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has not improved for a long time;
                    # stop early.
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # leave the batch loop

            if flag:
                time_dif = get_time_dif(start_time)
                logger.info('Training finished: {}'.format(time_dif))
                break
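# `batch_iter` comes from the project's data helpers and is not shown here.
# A minimal sketch of the usual behaviour (shuffle once, then yield fixed-size
# batches); treat the details as an assumption:
import numpy as np

def batch_iter(x, y, batch_size=64):
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))  # shuffle the samples
    x_shuffle = np.asarray(x)[indices]
    y_shuffle = np.asarray(y)[indices]
    for i in range(num_batch):
        start = i * batch_size
        end = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start:end], y_shuffle[start:end]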
# OK... time to start recklessly bolting things on.
# Read every file and count word frequencies.
word_collection = []
for text, _ in read_text(data_path, args.max_input_len, args.max_output_len):
    word_collection.extend([i for i in text])
# counter_vocab = Counter(word_collection).most_common(config.new_vocab_size)
counter_vocab = [word[0] for word in list(Counter(word_collection).items())
                 if word[1] > args.min_count]

# Build the new vocabulary.
_new_vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
_new_vocab.extend(counter_vocab)

# Now that we have the new vocabulary, we need a mapping from it to the old
# vocabulary, plus an index dict for the new vocabulary itself.
# So first read the old vocabulary.
read = read_vocab(vocab_path)
old_vocab = {}
for n, i in enumerate(read):
    old_vocab[i] = n

# Check whether every word in new_vocab exists in old_vocab; drop the ones
# that do not, and record each kept word's id in old_vocab.
new_vocab = []
vocab_dict = {}
token_num = []
for n, word in enumerate(_new_vocab):
    if word in old_vocab:
        new_vocab.append(word)
        vocab_dict[word] = len(vocab_dict)
        token_num.append(old_vocab[word])

# Update the saved weights accordingly and save them as a new model.
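# A hedged sketch of the step described by the last comment above: select the
# rows of the old token-embedding matrix that correspond to the kept tokens,
# in new-vocab order, and write out the trimmed vocabulary. The variable
# `old_embedding_matrix` and both output paths are assumptions, not part of
# the original code.
import numpy as np

new_embedding_matrix = np.asarray(old_embedding_matrix)[token_num]  # shape: (len(new_vocab), hidden)
np.save('new_token_embeddings.npy', new_embedding_matrix)           # assumed output file

with open('new_vocab.txt', 'w', encoding='utf-8') as f:             # assumed output file
    for word in new_vocab:
        f.write(word + '\n')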