def run(operation):
    """
    create by ljx
    Choose the operation to run on the model: training or testing.
    :param operation: either 'train' or 'test'
    :return:
    """
    if operation == 'train':
        train_data = read_corpus(args.train_data)
        test_data = read_corpus(args.test_data)
        train(train_data, test_data)
    elif operation == 'test':
        chk_file = tf.train.latest_checkpoint(params.store_path)
        test_data = read_corpus(args.test_data)
        test(test_data, chk_file)
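# A minimal usage sketch, assuming `args` and `params` above are already initialised
# and that the caller picks the mode (the real entry point may differ):
if __name__ == '__main__':
    run('train')  # or run('test') to restore the latest checkpoint and evaluate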
def make_batch():
    # Read the corpus and yield (batch_x, batch_y) lists of size batch_size;
    # the final, possibly smaller, batch is yielded as well.
    data = read_corpus(path_test)
    batch_x = []
    batch_y = []
    for (x, y) in data:
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) >= batch_size:
            yield batch_x, batch_y
            batch_x = []
            batch_y = []
    if len(batch_x) > 0:
        yield batch_x, batch_y
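# A minimal usage sketch for the generator above, assuming `path_test` and
# `batch_size` are defined at module level (as make_batch itself assumes):
for batch_x, batch_y in make_batch():
    print(len(batch_x), len(batch_y))  # every batch holds at most batch_size samples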
def feed_data(self):
    # Train for iter_size steps; every 100 steps log summaries, evaluate the
    # current batch, and checkpoint whenever the batch accuracy improves.
    accuracy = 0.0
    data = data_process.read_corpus()
    batch = data_process.batch_data(data, self.batch_size)
    for i in range(self.iter_size):
        batch_x, batch_y, sequence_len = next(batch)
        if i % 100 == 0:
            summary, y_hat, loss = sess.run(
                [self.merged, self.y_hat, self.loss],
                feed_dict={
                    self.x: batch_x,
                    self.y: batch_y,
                    self.droput: 0.5,
                    self.sequence_len: sequence_len
                })
            accu = self.evaluate(batch_x, batch_y, sequence_len)
            self.train_writer.add_summary(summary, i)
            print("y_hat is :", y_hat.shape)
            print("log loss is : ", loss)
            if accu > accuracy:
                self.saver.save(sess, './model/lstm', global_step=i)
                accuracy = accu
        self.train_step.run(
            feed_dict={
                self.x: batch_x,
                self.y: batch_y,
                self.droput: 0.5,
                self.sequence_len: sequence_len
            })
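# A minimal restore sketch, assuming the same graph has been rebuilt and a
# tf.train.Saver (like self.saver above) plus a session are in scope;
# the path mirrors the saver.save call above:
chk_file = tf.train.latest_checkpoint('./model')
saver.restore(sess, chk_file)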
parser.add_argument('--model_path', type=str, default='', help='model_path')
args = parser.parse_args()

# Prepare the training data
train_cn_file, train_en_file = './data/cn.txt', './data/en.txt'
cn_word2id_dict, cn_id2word_dict = construct_wordid_dict(train_cn_file, ttype='src')
en_word2id_dict, en_id2word_dict = construct_wordid_dict(train_en_file, ttype='tgt')
cn_vocab_size, en_vocab_size = len(cn_word2id_dict), len(en_word2id_dict)
cn_words_embedding = load_pickle_file('./word2vector_cn_embedding120.pkl')
en_words_embedding = load_pickle_file('./word2vector_en_embedding120.pkl')
cn_train_data = read_corpus(train_cn_file, ttype='src')
en_train_data = read_corpus(train_en_file, ttype='tgt')

# Prepare the development (test) data
dev_cn_file, dev_en_file = './data/cn_dev.txt', './data/en_dev.txt'
cn_dev_data = read_corpus(dev_cn_file, ttype='src')
en_dev_data = read_corpus(dev_en_file, ttype='tgt')

# Generate a timestamp for each training run
if args.is_training:
    print("\nTraining mode ....\n")
    timestamp = str(
        time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time())))
else:
    pass
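# A quick sanity-check sketch (hypothetical assertions; assumes each embedding matrix
# is a numpy array with one 120-dim row per vocabulary entry, as the file names suggest):
assert cn_words_embedding.shape == (cn_vocab_size, 120)
assert en_words_embedding.shape == (en_vocab_size, 120)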
# Understanding the data (visualisation / statistics)
#
# Understanding the data is the first step of any AI project: get an intuitive feel for what is in hand.
# TODO: count how often each word appears in qlist, sort the frequencies, and plot them.
# For example, if there are 7 distinct words with frequencies 4, 5, 10, 2, 1, 1, 1,
# sorting in descending order gives 10, 5, 4, 2, 1, 1, 1, and those 7 numbers are plotted.
# Use matplotlib's plot function; the y-axis is the word frequency.
from data_process import read_corpus
from collections import Counter
import matplotlib.pyplot as plt

qlist, alist = read_corpus('./data/train-v2.0.json')
words_cnt = Counter()
for text in qlist:
    # accumulate word frequencies
    words_cnt.update(text.strip(' .!?').split())
value_sort = sorted(words_cnt.values(), reverse=True)
plt.subplot(221)
plt.plot(value_sort)
plt.subplot(222)
plt.plot(value_sort[:2000])
plt.subplot(223)
plt.plot(value_sort[:200])
plt.subplot(224)
plt.plot(value_sort[:20])
plt.show()

# Show the 10 most frequent words. Because only high-frequency entries are used, collisions
# when inverting the counter (value -> key) are unlikely, and even a collision matters little.
inverse = dict(zip(words_cnt.values(), words_cnt.keys()))
print("number of word types: %d" % len(words_cnt))
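# A minimal alternative sketch: Counter.most_common returns the top-k (word, frequency)
# pairs directly and avoids the value -> key inversion above, which keeps only one word
# per shared frequency. Assumes words_cnt from the script above.
for word, freq in words_cnt.most_common(10):
    print(word, freq)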
            total_epoch_acc += acc_val.item()
            total_epoch_pre += pre_val
            total_epoch_f1 += f1_val
            print(f"validation in batch:{bat_num+1}\n")
    model.train()
    return (total_epoch_loss / (bat_num + 1), total_epoch_acc / (bat_num + 1),
            total_epoch_f1 / (bat_num + 1), total_epoch_pre / (bat_num + 1),
            total_epoch_recall / (bat_num + 1))


if __name__ == '__main__':
    opt = DefaultConfig()
    # path to the data file
    data_path = opt.data_path
    data = data_process.read_corpus(data_path)
    timestamp = str(int(time.time()))
    outdir = os.path.abspath(
        os.path.join(os.path.curdir, "checkpoints", timestamp))
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    vocab = data_process.load_vocab(opt.vocab_path)
    tag2label = {
        "O": 0,
        "B-W": 1,
        "I-W": 2,
    }
    tag2label[START_TAG] = len(tag2label)
    tag2label[STOP_TAG] = len(tag2label)
    print("mapping word and tag to id")
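    # A minimal sketch of using tag2label after the two insertions above, i.e.
    # {"O": 0, "B-W": 1, "I-W": 2, START_TAG: 3, STOP_TAG: 4}; the tag sequence is hypothetical:
    labels = [tag2label[t] for t in ["B-W", "I-W", "O"]]
    print(labels)  # -> [1, 2, 0]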
# encoding=utf-8
from gensim.models import word2vec
import data_process
import numpy as np
import pickle

data = data_process.read_corpus('source.txt')
sentences = [x[0] for x in data]
print("constructing model ...")
model = word2vec.Word2Vec(sentences, min_count=1, size=120)
print("read wordid...")
words_id, _ = data_process.load_wordid("word2id.pkl")
print("construct vectors...")
# Build the embedding matrix following the word order in words_id
embedding = [model[word] for word in words_id]
embedding = np.array(embedding)
print(embedding.shape)
# Write the word vectors to a pickle file so they can be loaded directly later
with open('data/word2vector_source_embedding120.pkl', 'wb') as fw:
    pickle.dump(embedding, fw)
# words_embedding, _ = data_process.load_wordid("word2vector_source_embedding100.pkl")
# print(words_embedding.shape)
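# A minimal sketch of loading the saved matrix back (assumes the pickle written above exists;
# unlike the commented-out load_wordid call, a plain pickle.load returns the single array):
with open('data/word2vector_source_embedding120.pkl', 'rb') as fr:
    words_embedding = pickle.load(fr)
print(words_embedding.shape)  # expected: (len(words_id), 120)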