def POST(self):
    web.header('Content-Type', 'application/json')
    post_data = web.input(_method='post')
    # print post_data
    # Read the text from the request and save it into a temporary file
    file_id = np.random.randint(1000000, 2000000)
    input_path = os.path.join("tmp", "user.%i.input" % file_id)
    with codecs.open(input_path, "w", "utf-8") as f:
        f.write(post_data["text"])
    test_sentences = loader.load_sentences(input_path, lower, zeros)
    update_tag_scheme(test_sentences, tag_scheme)
    test_data = prepare_dataset3(
        test_sentences, word_to_id, char_to_id,
        model.tag_maps, model.feature_maps, lower
    )
    # print test_data[0]
    out_sentences = predict_multilayer(parameters, f_eval, test_sentences,
                                       test_data, model.tag_maps, None)
    # predictions_list = [p.split("\t") for p in predictions]
    text = " ".join([line[0] for s in out_sentences for line in s])
    data = {"sentences": out_sentences, "text": text}
    return json.dumps(data, indent=4, sort_keys=True, encoding="utf-8")
def load_gramcnn(): #load parameters #print '------params----' opts, parameters, model_name = load_object('main_params.pkl') #prep for gram-cnn #print '------gram-cnn params----' lower = parameters['lower'] zeros = parameters['zeros'] tag_scheme = parameters['tag_scheme'] word_to_id, char_to_id, tag_to_id, pt_to_id, dico_words, id_to_tag = reload_mappings( os.path.join(models_path, model_name, 'mappings.pkl')) if os.path.isfile(opts.test): test_sentences = loader.load_sentences(opts.test, lower, zeros) update_tag_scheme(test_sentences, tag_scheme) if os.path.isfile(opts.test): test_data, m3 = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, pt_to_id, lower) max_seq_len = m3 if m3 > 200 else 200 word_emb_weight = np.zeros((len(dico_words), parameters['word_dim'])) n_words = len(dico_words) #print '------gramcnn model----' print ' [*] Loading GRAMCNN tensorflow model (3min)...' gramcnn = GRAMCNN( n_words, len(char_to_id), len(pt_to_id), use_word=parameters['use_word'], use_char=parameters['use_char'], use_pts=parameters['pts'], num_classes=len(tag_to_id), word_emb=parameters['word_dim'], drop_out=0, word2vec=word_emb_weight, feature_maps=parameters['num_kernels'], #,200,200, 200,200], kernels=parameters['kernels'], hidden_size=parameters['word_lstm_dim'], hidden_layers=parameters['hidden_layer'], padding=parameters['padding'], max_seq_len=max_seq_len) #print '------gramcnn load----' gramcnn.load(models_path, model_name) compilation = [ opts, id_to_tag, word_to_id, char_to_id, tag_to_id, pt_to_id, lower, max_seq_len ] print ' [*] Finished loading.' return compilation, parameters, gramcnn
def test():
    test_sentences = loader.load_data(test_path, zeros=False)
    loader.update_tag_scheme(test_sentences, 'iob')
    test_data = loader.prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id)
    print("%i sentences in test." % (len(test_data)))
    confusion_matrix = torch.zeros((len(tag_to_id) - 2, len(tag_to_id) - 2))
    for data in test_data:
        sentence_in = torch.tensor(data['words'], dtype=torch.long)
        cap_in = torch.tensor(data['caps'], dtype=torch.long)
        tags = torch.tensor(data['tags'])
        chars2 = data['chars']
        # Sort character sequences by length and remember the original positions
        chars2_sorted = sorted(chars2, key=lambda p: len(p), reverse=True)
        d = {}
        for i, ci in enumerate(chars2):
            for j, cj in enumerate(chars2_sorted):
                if ci == cj and j not in d and i not in d.values():
                    d[j] = i
                    continue
        chars2_length = [len(w) for w in chars2_sorted]
        char_maxl = max(chars2_length)
        chars2_mask = np.zeros((len(chars2_sorted), char_maxl), dtype='int')
        for i, c in enumerate(chars2_sorted):
            chars2_mask[i, :chars2_length[i]] = c
        chars2_mask = torch.tensor(chars2_mask, dtype=torch.long)
        val, out = model(sentence_in, chars2_mask, cap_in, chars2_length, d)
        predicted_id = out
        for (true_id, pred_id) in zip(tags, predicted_id):
            confusion_matrix[true_id, pred_id] += 1
    num_correct = 0
    for i in range(confusion_matrix.size(0)):
        num_correct += confusion_matrix[i][i]
    accuracy = num_correct / np.sum(confusion_matrix.data.numpy())
    print(accuracy.data)
def test():
    make_path(FLAGS)
    config = load_config(FLAGS.config_file)
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    test_manager = BatchManager(test_data, 100)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def predict_a_file(test_file, out_file, model, add_o_tag):
    assert os.path.isfile(test_file)
    model = Model(model_path=model)
    parameters = model.parameters
    # Data parameters
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = model.parameters['tag_scheme']
    # Load reverse mappings
    word_to_id, char_to_id, tag_to_id = [
        {v: k for k, v in x.items()}
        for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
    ]
    print 'Reloading previous model...'
    _, f_eval = model.build(training=False, **parameters)
    model.reload()
    test_sentences = loader.load_sentences(test_file, lower, zeros)
    update_tag_scheme(test_sentences, tag_scheme)
    test_data = prepare_dataset2(test_sentences, word_to_id, char_to_id,
                                 tag_to_id, model.feature_maps, lower)
    print "input: ", test_file, ":", len(test_sentences), len(test_data)
    print "output: ", out_file
    predict(parameters, f_eval, test_sentences, test_data, model.id_to_tag,
            out_file, add_O_tags=add_o_tag)
def train():
    # Load the training data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # Load the dev and test sets
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES): I = inside, O = outside, B = begin, E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Count the frequency of each character and assign it an id
    _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
    # Count the frequency of each entity tag and assign it an id
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path,
                                           FLAGS.tag_to_id_path)
    # Write the mappings to a pkl file
    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)

    # Prepare the data: build the index lists fed to the network.
    # train_data[0][0]: the sentence; train_data[0][1]: character ids;
    # train_data[0][2]: segmentation features (0 for a single-character word,
    # 1, 2, ..., 2, 3 for longer words); train_data[0][3]: the tag of each character
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)

    # Split the data into batches (60 sentences each), producing iterable objects
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    config = config_model(char_to_id, tag_to_id)  # fill in the model configuration

    # Limit GPU usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, load_word2vec, config, id_to_char)
        saver = tf.train.Saver()  # used to save the model
        with tf.device("/cpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    # Train the model batch by batch; training starts here, and the whole
                    # network can be traced backwards from this call
                    step, batch_loss = model.run_step(sess, True, batch)
                # Evaluate on the dev set and compute F1 every 5 epochs
                if (i + 1) % 5 == 0:
                    f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag)
                    print("F1 on the dev set:", f1)
                # Save the model every 20 epochs
                if (i + 1) % 20 == 0:
                    saver.save(sess, save_path=FLAGS.ckpt_path)
def main():
    # Make the run reproducible
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    train_sentences = load_sentences(args.train_file)
    dev_sentences = load_sentences(args.dev_file)
    test_sentences = load_sentences(args.test_file)
    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    with open(args.map_file, 'rb') as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    train_manager = BatchManager(train_data, args.batch_size, args.num_steps)
    dev_manager = BatchManager(dev_data, 100, args.num_steps)
    test_manager = BatchManager(test_data, 100, args.num_steps)

    if args.cuda >= 0:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        device = torch.device(args.cuda)
    else:
        device = torch.device('cpu')
    print("device: ", device)

    if args.train:
        train(id_to_char, id_to_tag, train_manager, dev_manager, device)
    f1, res_info = eval_model(id_to_char, id_to_tag, test_manager, device, args.log_name)
    log_handler.info("\n resinfo {} \n F1: {}".format(res_info, f1))
def runModelInLoop(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    # Results file
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    # Iterate over every hyper-parameter combination and dataset
    # (equivalent to the original nested loops)
    for u_dropout, v_char_dim, w_char_lstm_dim, x_word_dim, y_word_lstm_dim, dataset in \
            itertools.product(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim, datasets):
        print "+++++++++++++++"
        print u_dropout, v_char_dim, w_char_lstm_dim, x_word_dim, y_word_lstm_dim, dataset
        parameters['dropout'] = u_dropout
        parameters['char_dim'] = v_char_dim
        parameters['char_lstm_dim'] = w_char_lstm_dim
        parameters['word_dim'] = x_word_dim
        parameters['word_lstm_dim'] = y_word_lstm_dim

        # If the dataset is i2b2-2010, assign the predefined paths
        if dataset == "i2b2-2010":
            opts.train = i2b2BasePath + "train.txt"
            opts.dev = i2b2BasePath + "dev.txt"
            opts.test = i2b2BasePath + "test.txt"
            resultsFile = resultsPath + "i2b2_2010_Results.txt"

        # Initialize model
        model = Model(parameters=parameters, models_path=models_path)
        print "Model location: %s" % model.model_path

        # Data parameters
        lower = parameters['lower']
        zeros = parameters['zeros']
        tag_scheme = parameters['tag_scheme']

        # Load sentences
        train_sentences = loader.load_sentences(opts.train, lower, zeros)
        dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
        test_sentences = loader.load_sentences(opts.test, lower, zeros)

        # Use selected tagging scheme (IOB / IOBES)
        update_tag_scheme(train_sentences, tag_scheme)
        update_tag_scheme(dev_sentences, tag_scheme)
        update_tag_scheme(test_sentences, tag_scheme)

        # Create a dictionary / mapping of words
        # If we use pretrained embeddings, we add them to the dictionary.
        if parameters['pre_emb']:
            dico_words_train = word_mapping(train_sentences, lower)[0]
            dico_words, word_to_id, id_to_word = augment_with_pretrained(
                dico_words_train.copy(),
                parameters['pre_emb'],
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in dev_sentences + test_sentences])
                ) if not parameters['all_emb'] else None
            )
        else:
            dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
            dico_words_train = dico_words

        # Create a dictionary and a mapping for words / POS tags / tags
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        print "Calling prepare_dataset..."
        # Index data
        train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower)
        dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower)
        print "%i / %i / %i sentences in train / dev / test." % (
            len(train_data), len(dev_data), len(test_data))

        # Save the mappings to disk
        print 'Saving the mappings to disk...'
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

        # Build the model
        f_train, f_eval = model.build(**parameters)

        # Reload previous model values
        if opts.reload:
            print 'Reloading previous model...'
            model.reload()

        # Train network
        singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1])
        n_epochs = 2  # number of epochs over the training set
        freq_eval = 1000  # evaluate on dev every freq_eval steps
        best_dev = -np.inf
        best_test = -np.inf
        count = 0
        for epoch in xrange(n_epochs):
            epoch_costs = []
            print "Starting epoch %i..." % epoch
            for i, index in enumerate(np.random.permutation(len(train_data))):
                count += 1
                input = create_input(train_data[index], parameters, True, singletons)
                new_cost = f_train(*input)
                epoch_costs.append(new_cost)
                # if i % 50 == 0 and i > 0:
                #     print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                if count % freq_eval == 0:
                    dev_score = evaluate(parameters, f_eval, dev_sentences,
                                         dev_data, id_to_tag, dico_tags)
                    test_score = evaluate(parameters, f_eval, test_sentences,
                                          test_data, id_to_tag, dico_tags)
                    print "Score on dev: %.5f" % dev_score
                    print "Score on test: %.5f" % test_score
                    if dev_score > best_dev:
                        best_dev = dev_score
                        print "New best score on dev: " + str(best_dev)
                        # print "Saving model to disk..."
                        # model.save()
                    if test_score > best_test:
                        best_test = test_score
                        print "New best score on test: " + str(best_test)
                        # print "Config values used are : "
            print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))

        # Write the best dev and test scores to the results file
        del model
        with open(resultsFile, 'a') as f:
            f.write("dropout: " + str(parameters['dropout']) +
                    " | char_dim: " + str(parameters['char_dim']) +
                    " | char_lstm_dim: " + str(parameters['char_lstm_dim']) +
                    " | word_dim: " + str(parameters['word_dim']) +
                    " | word_lstm_dim: " + str(parameters['word_lstm_dim']) +
                    " | Best Dev Score: " + str(best_dev) +
                    " | Best Test Score: " + str(best_test) + "\n")
    return
# check1word(test_sentences)
# Merge dev into train
totalSentences = train_sentences + dev_sentences
# Redefine train and dev.
# The corpus is already randomly generated, so there is no need to shuffle:
# random.seed(SEED); random.shuffle(totalSentences)
devRatio = 0.1  # document-level 10% != sentence-level 10%
devBoundary = int(len(totalSentences) * (1 - devRatio))
train_sentences = totalSentences[:devBoundary]
dev_sentences = totalSentences[devBoundary:]

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme, tagFilter)
update_tag_scheme(dev_sentences, tag_scheme, tagFilter)
update_tag_scheme(test_sentences, tag_scheme, tagFilter)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences]))
        if not parameters['all_emb'] else None)
else:
def train_new(): train_sent = load_sentences(FLAGS.filepath) update_tag_scheme(train_sent, FLAGS.tag_schema) if not os.path.isfile(FLAGS.map_file): _c, char_to_id, id_to_char = char_mapping(train_sent, FLAGS.lower) print("random embedding") # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sent) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # 数据准备,划分验证集和训练集 np.random.seed(10) train_sent_ = np.array(train_sent) shuffle_indices = np.random.permutation(np.arange(len(train_sent))) sent_shuffled = train_sent_[shuffle_indices] dev_sample_index = -1 * int(FLAGS.dev_percentage * float(len(train_sent))) train_sent_new, dev_sent = sent_shuffled[:dev_sample_index], sent_shuffled[ dev_sample_index:] train_data = prepare_dataset(train_sent_new, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sent, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i sentences in train." % (len(train_data), len(dev_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = FLAGS.log_file logger = get_logger(log_path) print_config(config, logger) # 根据需求,设置动态使用GPU资源 tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: fig = plt.figure() ax = fig.add_subplot(211) ax2 = fig.add_subplot(212) plt.grid(True) plt.ion() model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % 20 == 0: ax.scatter(step, np.mean(loss), c='b', marker='.') plt.pause(0.001) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best, f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) ax2.scatter(i + 1, f1, c='b', marker='.') plt.pause(0.001) if best: save_model(sess, model, FLAGS.ckpt_path, logger, "best")
def train(): # load data sets datasets = load_sentences(FLAGS.train_file, FLAGS.lower) random.shuffle(datasets) train_sentences = datasets[:14000] test_sentences = datasets[14000:] # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i sentences in train / dev." % (len(train_data), len(test_data))) elmo_batcher = get_batcher() train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher) test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto(allow_soft_placement=True) tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: elmo_model = load_elmo() model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info( "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "test", test_manager, id_to_tag, logger) # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger)
def main(): # load data sets global args args = parser.parse_args() pp.pprint(vars(args)) # running_name = 'X' use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu) # use_cuda = False # train_file = 'data/example.train' # dev_file = 'data/example.dev' test_file = 'data/example.test' # embedding_file = 'data/vec.txt' map_file = 'map.pkl' # config_file = 'config_file_pytorch' tag_file = 'tag.pkl' # embedding_easy_file = 'data/easy_embedding.npy' # train_sentences = load_sentences(train_file) # dev_sentences = load_sentences(dev_file) test_sentences = load_sentences(test_file) # train_sentences = dev_sentences # update_tag_scheme(train_sentences, args.tag_schema) update_tag_scheme(test_sentences, args.tag_schema) # update_tag_scheme(dev_sentences, args.tag_schema) if not os.path.isfile(tag_file): print("Tag file {:s} Not found".format(tag_file)) sys.exit(-1) else: with open(tag_file, 'rb') as t: tag_to_id, id_to_tag = pickle.load(t) if not os.path.isfile(map_file): print("Map file {:s} Not found".format(map_file)) # create dictionary for word # dico_chars_train = char_mapping(train_sentences)[0] # dico_chars, char_to_id, id_to_char = augment_with_pretrained( # dico_chars_train.copy(), # embedding_file, # list(itertools.chain.from_iterable( # [[w[0] for w in s] for s in test_sentences]) # ) # ) # # _, tag_to_id, id_to_tag = tag_mapping(train_sentences) # # with open(map_file, "wb") as f: # pickle.dump([char_to_id, id_to_char], f) else: with open(map_file, "rb") as f: char_to_id, id_to_char = pickle.load(f) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id) print("{:d} sentences in test.".format(len(test_data))) test_manager = BatchManager(test_data, 1) save_places = dir_utils.save_places(args.eval) # log_path = os.path.join("log", FLAGS.log_file) logger = get_logger( os.path.join(save_places.log_save_dir, 'evaluation-{:d}.txt'.format(args.fileid))) config = config_model(char_to_id, tag_to_id, args) print_config(config, logger) logger.info("start training") #Update: create model and embedding! model = NERModel.CNERPointer(char_dim=args.char_dim, seg_dim=args.seg_dim, hidden_dim=args.hidden_dim, max_length=15, output_classes=4, dropout=args.dropout, embedding_path=None, id_to_word=id_to_char, easy_load=None) print("Number of Params\t{:d}".format( sum([p.data.nelement() for p in model.parameters()]))) #Update: this won't work! # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id, multiGpu=args.multiGpu) if use_cuda: model = model.cuda() model.eval() if args.eval is not None: # if os.path.isfile(args.resume): ckpt_filename = os.path.join( save_places.model_save_dir, 'checkpoint_{:04d}.pth.tar'.format(args.fileid)) assert os.path.isfile( ckpt_filename), 'Error: no checkpoint directory found!' checkpoint = torch.load(ckpt_filename, map_location=lambda storage, loc: storage) model.load_state_dict(checkpoint['state_dict'], strict=True) train_iou = checkpoint['IoU'] print("=> loading checkpoint '{}', current iou: {:.04f}".format( ckpt_filename, train_iou)) ner_results = evaluate(model, test_manager, id_to_tag, use_cuda, max_len=5) eval_lines = test_ner(ner_results, save_places.summary_save_dir) for line in eval_lines: logger.info(line) f1 = float(eval_lines[1].strip().split()[-1]) return f1
def main(argv=None):  # pylint: disable=unused-argument
    # if tf.gfile.Exists(FLAGS.eval_dir):
    #     tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    # tf.gfile.MakeDirs(FLAGS.eval_dir)

    # Read parameters from command line
    opts = read_args(evaluation=True)

    # Parse parameters
    parameters = form_parameters_dict(opts)

    # Check parameters validity
    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    assert os.path.isfile(opts.test)
    assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
    assert 0. <= parameters['dropout'] < 1.0
    assert parameters['t_s'] in ['iob', 'iobes']
    assert not parameters['all_emb'] or parameters['pre_emb']
    assert not parameters['pre_emb'] or parameters['word_dim'] > 0
    assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

    # Check evaluation script / folders
    if not os.path.isfile(eval_script):
        raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
    if not os.path.exists(eval_temp):
        os.makedirs(eval_temp)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    event_logs_path = os.path.join(eval_temp, "eval_logs")
    # if not os.path.exists(event_logs_path):
    #     os.makedirs(event_logs_path)

    # Initialize model
    model = MainTaggerModel(parameters=parameters, models_path=models_path,
                            overwrite_mappings=opts.overwrite_mappings)
    print "MainTaggerModel location: %s" % model.model_path

    # Data parameters
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['t_s']

    max_sentence_lengths = {}
    max_word_lengths = {}

    # Load sentences
    train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
        loader.load_sentences(opts.train, lower, zeros)
    dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = \
        loader.load_sentences(opts.dev, lower, zeros)
    test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = \
        loader.load_sentences(opts.test, lower, zeros)

    global_max_sentence_length, global_max_char_length = \
        calculate_global_maxes(max_sentence_lengths, max_word_lengths)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences])
            ) if not parameters['all_emb'] else None
        )
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for words / POS tags / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    if opts.overwrite_mappings:
        print 'Saving the mappings to disk...'
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

    model.reload_mappings()

    # Index data
    train_buckets, train_stats, train_unique_words = prepare_dataset(
        train_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length, lower
    )
    dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
        dev_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length, lower
    )
    test_buckets, test_stats, test_unique_words = prepare_dataset(
        test_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length, lower
    )

    print "%i / %i / %i sentences in train / dev / test." % (
        len(train_stats), len(dev_stats), len(test_stats))
    print "%i / %i / %i words in train / dev / test." % (
        sum([x[0] for x in train_stats]), sum([x[0] for x in dev_stats]),
        sum([x[0] for x in test_stats]))
    print "%i / %i / %i longest sentences in train / dev / test." % (
        max([x[0] for x in train_stats]), max([x[0] for x in dev_stats]),
        max([x[0] for x in test_stats]))
    print "%i / %i / %i shortest sentences in train / dev / test." % (
        min([x[0] for x in train_stats]), min([x[0] for x in dev_stats]),
        min([x[0] for x in test_stats]))

    for i, label in [[2, 'char']]:
        print "%i / %i / %i total %s in train / dev / test." % (
            sum([sum(x[i]) for x in train_stats]), sum([sum(x[i]) for x in dev_stats]),
            sum([sum(x[i]) for x in test_stats]), label)
        print "%i / %i / %i max. %s lengths in train / dev / test." % (
            max([max(x[i]) for x in train_stats]), max([max(x[i]) for x in dev_stats]),
            max([max(x[i]) for x in test_stats]), label)
        print "%i / %i / %i min. %s lengths in train / dev / test." % (
            min([min(x[i]) for x in train_stats]), min([min(x[i]) for x in dev_stats]),
            min([min(x[i]) for x in test_stats]), label)

    print "Max. sentence lengths: %s" % max_sentence_lengths
    print "Max. char lengths: %s" % max_word_lengths

    for label, bin_stats, n_unique_words in [['train', train_stats, train_unique_words],
                                             ['dev', dev_stats, dev_unique_words],
                                             ['test', test_stats, test_unique_words]]:
        int32_items = len(train_stats) * (
            max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
        float32_items = n_unique_words * parameters['word_dim']
        total_size = int32_items + float32_items
        logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
        logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
            n_unique_words, label, float32_items))
        logging.info("Total size of the %s dataset is %d" % (label, total_size))

    batch_size = 5

    # Build the model
    cost, train_step, tag_scores, tag_ids, word_ids, \
        crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
            max_sentence_length_scalar=global_max_sentence_length,
            max_word_length_scalar=global_max_char_length,
            batch_size_scalar=batch_size,
            **parameters)

    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                               """Directory where to write event logs.""")
    tf.app.flags.DEFINE_string('eval_data', 'test',
                               """Either 'test' or 'train_eval'.""")
    tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                               """Directory where to read model checkpoints.""")
    tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                                """How often to run the eval.""")
    tf.app.flags.DEFINE_integer('num_examples', 10000,
                                """Number of examples to run.""")
    tf.app.flags.DEFINE_boolean('run_once', False,
                                """Whether to run eval only once.""")

    evaluate(model, dev_buckets, test_buckets, FLAGS, opts, id_to_tag, batch_size,
             placeholders, enqueue_op, tag_scores, tag_ids, word_ids,
             crf_transition_params, sentence_lengths, FLAGS.eval_dir, tag_scheme)
def train(): # 加载数据集 train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # 选择tag schema(IOB / IOBES) I:中间,O:其他,B:开始 | E:结束,S:单个 update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) update_tag_scheme(dev_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # 配置文件:char_to_id, id_to_char, tag_to_id, id_to_tag的数据 # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences]) ) ) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path, FLAGS.tag_to_id_path) # with open('maps.txt','w',encoding='utf8') as f1: # f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n') with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # # prepare data, get a collection of list containing index # train_data[0][0]:一句话; # train_data[0][1]:单个字的编号; # train_data[0][2]:切词之后,切词特征:词的大小是一个字的话是0,词的大小是2以上的话:1,2....,2,3; # train_data[0][3]:每个字的标签 train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower ) dev_data = prepare_dataset( dev_sentences, char_to_id, tag_to_id, FLAGS.lower ) test_data = prepare_dataset( test_sentences, char_to_id, tag_to_id, FLAGS.lower ) print("%i / %i / %i sentences in train / dev / test." % ( len(train_data), 0, len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) # 按batch size将数据拆分 dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] # tf.device("/cpu:0") 指定运行的GPU(默认为GPU:0) with tf.device("/cpu:0"): for i in range(100): # 按批次训练模型。这个是训练的开始,可以从这里倒着找整个网络怎么训练 for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) # 打印信息: # iteration:迭代次数,也就是经过多少个epoch; # if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if i % 7 == 0: save_model(sess, model, FLAGS.ckpt_path, logger)
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {"ner": NerProcessor}
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # Load the BERT model configuration
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    # Make sure the NER max_seq_length does not exceed BERT's 512-token limit
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    # Get the label list, e.g. ["O", "B-DIS", "I-DIS", "X", "[CLS]", "[SEP]"]
    label_list = processor.get_labels()

    # Initial vocab processing (word-to-id mapping, lower-casing, etc.)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:  # use_tpu defaults to False
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        # How often to save the model checkpoint (1000)
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,  # 1000
            num_shards=FLAGS.num_tpu_cores,  # 8
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # Warm-up step ratio: e.g. with 100 total steps and warmup_proportion=0.1,
    # the first 10 steps use a lower learning rate
    # (lr = global_step / num_warmup_steps * init_lr); after that the normal
    # (or decayed) learning rate is used.

    ##################
    # Load the data: a nested list where the outer level is a sentence and the
    # inner level is each character with its tag
    train_sentences = load_sentences(
        os.path.join(FLAGS.data_dir, "ner.train"), FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                   FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data_dir, "ner.dev"),
                                    FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES); default IOBES, converting IOB tags to IOBES
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # use pre-trained embedding
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # Ensure that test-set characters unseen in training still get a
            # pre-trained embedding
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(  # flatten the nested lists
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # Run mark_mapping
        _c, mark_to_id, id_to_mark = mark_mapping(train_sentences)
        entropy_dict = load_entropy_dict(FLAGS.entropy_dict)

        with open(map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         mark_to_id, id_to_mark, entropy_dict], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, \
                mark_to_id, id_to_mark, entropy_dict = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 mark_to_id, entropy_dict, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               mark_to_id, entropy_dict, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                mark_to_id, entropy_dict, FLAGS.lower)
    ###############

    if FLAGS.do_train:
        # Each returned element is an InputExample object
        train_examples = processor.get_train_examples(FLAGS.data_dir, train_data)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        # Load the pre-trained BERT weights to initialize fine-tuning
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    filed_based_convert_examples_to_features(train_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer, train_file)

    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    filed_based_convert_examples_to_features(eval_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer, eval_file)

    token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
    with open(FLAGS.output_dir + '/label2id.pkl', 'rb') as rf:
        label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}
    if os.path.exists(token_path):
        os.remove(token_path)

    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    # batch_labels is per-sentence, e.g. [[1, 2, 0, 0, 1, 2], [...]]
    batch_tokens, batch_labels = filed_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file, mode="test")

    for actual_train_step in list(range(1000, num_train_steps, 2000)) + [num_train_steps]:
        if FLAGS.do_train:
            start = time.clock()
            tf.logging.info("start training time: %f", start)
            tf.logging.info("***** Running training *****")
            tf.logging.info("  Num examples = %d", len(train_examples))
            tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
            tf.logging.info("  Num steps = %d", actual_train_step)
            train_input_fn = file_based_input_fn_builder(
                input_file=train_file,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True)
            estimator.train(input_fn=train_input_fn, max_steps=actual_train_step)
            end = time.clock()
            tf.logging.info("end training time: %f", end)
            tf.logging.info("training time: %f", end - start)

        if FLAGS.do_eval:
            start = time.clock()
            tf.logging.info("start evaluation time: %f", start)
            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("  Num examples = %d", len(eval_examples))
            tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
            eval_steps = None
            if FLAGS.use_tpu:
                eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
            eval_drop_remainder = True if FLAGS.use_tpu else False
            eval_input_fn = file_based_input_fn_builder(
                input_file=eval_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=eval_drop_remainder)
            result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
            output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            end = time.clock()
            tf.logging.info("end evaluation time: %f", end)
            tf.logging.info("evaluation time: %f", end - start)

        if FLAGS.do_predict:
            start = time.clock()
            tf.logging.info("start predict time: %f", start)
            tf.logging.info("***** Running prediction *****")
            tf.logging.info("  Num examples = %d", len(predict_examples))
            tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
            if FLAGS.use_tpu:
                # Warning: According to tpu_estimator.py, prediction on TPU is an
                # experimental feature and hence not supported here
                raise ValueError("Prediction in TPU not supported")
            predict_drop_remainder = True if FLAGS.use_tpu else False
            predict_input_fn = file_based_input_fn_builder(
                input_file=predict_file,
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=predict_drop_remainder)
            result = estimator.predict(input_fn=predict_input_fn)
            _result = []
            for prediction in result:
                _result += [prediction_id for prediction_id in prediction]
            output_predict_file = os.path.join(
                FLAGS.output_dir + "/label_test/",
                "label_test.txt-" + str(actual_train_step))
            Writer(output_predict_file, _result, batch_tokens, batch_labels, id2label)
            end = time.clock()
            tf.logging.info("end predict time: %f", end)
            tf.logging.info("predict time: %f", end - start)
def train(): # load data sets:返回的是语料集的[['字','标'],...]元组 train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # 由loader.py负责处理数据 # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: # 判断是否用之前训练好的词向量 dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] # dico_chars_train应该只接收了dico <注意后面的[0]> ,即训练数据的不重复统计的字集 dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences])) # chain.from_iterable(iterables): 一个备用链构造函数,其中的iterables是一个迭代变量,生成迭代序列 # 所以这里的list生成的就是test_sentences里的字集 ) # 这里dico_chars是在train_set字典基础上添加wiki_100中包含的test_set里的字构成的字典 else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) # 通过pickle模块的序列化操作我们能够将程序中运行的对象信息保存到文件中去,永久存储。 else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) # xxx_data 以句子为单位存储[字符,字符id,标签id/chars长度的全是“0”对应标签id的list <train = True/False>,标签] print("%i / %i / %i sentences in train / dev / test." % (len(train_data), 0, len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) # 默认的batch_size为20 dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # 定义了3个BatchManager类:这个类中包含batch_data和len_data # batch_data 是按句子长短顺序排序后一个batch大小的data列表数据,而且每个batch中的数据都padding到统一长短 # len_data 是所分batch的数量 # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto( ) # tf.ConfigProto一般用在创建session的时候。用来对session进行参数配置 tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train(self, n_epochs=100, freq_eval=1000, verbose=True, eval_test_set=False):
    """
    :param n_epochs: number of epochs over the training set
    :param freq_eval: evaluate on dev every freq_eval steps
    :return: Saves the model with the best F1-Score, evaluated on the dev set
    """
    # Initialize model
    model = Model(parameters=self.parameters, models_path=models_path)
    print("Model location: %s" % model.model_path)

    # Data parameters
    lower = self.parameters['lower']
    zeros = self.parameters['zeros']
    tag_scheme = self.parameters['tag_scheme']

    # Load sentences
    train_sentences = loader.load_sentences(self.parameters['train'], lower, zeros)
    dev_sentences = loader.load_sentences(self.parameters['dev'], lower, zeros)
    test_sentences = loader.load_sentences(self.parameters['test'], lower, zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if self.parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            self.parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences]))
            if not self.parameters['all_emb'] else None)
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for words / POS tags / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    # Index data
    train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower)
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Save the mappings to disk
    print('Saving the mappings to disk...')
    model.save_mappings(id_to_word, id_to_char, id_to_tag)

    # Build the model
    f_train, f_eval = model.build(**self.parameters)

    # Reload previous model values
    if self.parameters['reload']:
        print('Reloading previous model...')
        model.reload()

    #
    # Train network
    #
    singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1])
    best_dev = -np.inf
    best_test = -np.inf
    count = 0
    for epoch in range(n_epochs):
        epoch_costs = []
        print("Starting epoch %i at..." % epoch, time.ctime())
        for i, index in enumerate(np.random.permutation(len(train_data))):
            count += 1
            input = create_input(train_data[index], self.parameters, True, singletons)
            new_cost = f_train(*input)
            epoch_costs.append(new_cost)
            if i % 50 == 0 and i > 0 and verbose:
                print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])))
            if count % freq_eval == 0:
                dev_score = evaluate(self.parameters, f_eval, dev_sentences,
                                     dev_data, id_to_tag, verbose=verbose)
                if eval_test_set:
                    test_score = evaluate(self.parameters, f_eval, test_sentences,
                                          test_data, id_to_tag, verbose=verbose)
                print("Score on dev: %.5f" % dev_score)
                if eval_test_set:
                    print("Score on test: %.5f" % test_score)
                if dev_score > best_dev:
                    best_dev = dev_score
                    print("New best score on dev.")
                    print("Saving model to disk...")
                    model.save()
                if eval_test_set:
                    if test_score > best_test:
                        best_test = test_score
                        print("New best score on test.")
        print("Epoch %i done. Average cost: %f. Ended at..." %
              (epoch, np.mean(epoch_costs)), time.ctime())
    return best_dev
def train(): train_sentences = load_sentences(FLAGS.train_file) dev_sentences = load_sentences(FLAGS.dev_file) test_sentences = load_sentences(FLAGS.test_file) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) update_tag_scheme(dev_sentences, FLAGS.tag_schema) if not os.path.isfile(FLAGS.map_file): if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, 'wb') as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, 'rb') as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id) train_manager = BatchManager(train_data, FLAGS.batch_size, FLAGS.num_steps) dev_manager = BatchManager(dev_data, 100, FLAGS.num_steps) test_manager = BatchManager(test_data, 100, FLAGS.num_steps) make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) os.environ["CUDA_VISIBLE_DEVICES"] = "3" gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9) tf_config = tf.ConfigProto(gpu_options=gpu_options) tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(75): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{},".format( iteration, step % steps_per_epoch, steps_per_epoch)) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # "sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'I-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'I-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'I-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'I-TYPE']]" dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) # print("train_sentences[0]:{}".format(train_sentences[0])) # "train_sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'E-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'E-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'E-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'E-TYPE']]" update_tag_scheme(dev_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist # print("map_file:{}".format(FLAGS.map_file)) # print("pre_emb:{}".format(FLAGS.pre_emb)) # map_file: maps.pkl # pre_emb: False if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping( train_sentences, FLAGS.lower)[0] # character -> count dict dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), 0, len(test_data))) # '3027 / 0 / 361 sentences in train / dev / test.' 
# print("batch_size:{}".format(FLAGS.batch_size)) # batch_size: 20 train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) # print("config_file:{}".format(FLAGS.config_file)) # config_file: config_file if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) log_path = os.path.join("log", FLAGS.log_file) # log_path:log/train.log logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data # print("steps_per_epoch:{}".format(steps_per_epoch)) # steps_per_epoch: 152 with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) # print("steps_check:{}".format(FLAGS.steps_check)) # steps_check: 100 if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger) export(model, sess, "ner", "export_model")
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # training set: 101218 sentences dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) # dev set: 7827 sentences test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # test set: 16804 sentences # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) # convert IOB tags to IOBES update_tag_scheme(test_sentences, FLAGS.tag_schema) # convert IOB tags to IOBES update_tag_scheme(dev_sentences, FLAGS.tag_schema) # convert IOB tags to IOBES # create maps if not exist if not os.path.isfile(FLAGS.map_file): # check whether maps.pkl exists # create dictionary for word if FLAGS.pre_emb: # whether to use pretrained character embeddings; the test set contains characters not seen in training dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] # character frequency counts (dico_chars) dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable( # flatten into a single list [[w[0] for w in s] for s in test_sentences]) # w[0] is a character )) # build a dictionary for every character and every word else: # ids for each character and each tag _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags (an id for each tag) _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) # count frequencies, sort, write to file #with open('maps.txt','w',encoding='utf8') as f1: #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n') with open(FLAGS.map_file, "wb") as f: # persist the mappings to disk pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( # convert characters/words to numeric features train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test."
% (len(train_data), len(dev_data), len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) # iterate over the training set 60 sentences at a time dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) # create the log, result and ckpt directories if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) # char-to-id and tag-to-id mappings save_config(config, FLAGS.config_file) # a new config_file is generated whenever the data changes make_path(FLAGS) # create the log, result and ckpt directories used by the model log_path = os.path.join("log", FLAGS.log_file) # path of the log file logger = get_logger(log_path) # define the log output format print_config(config, logger) # write the config to the log # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # let GPU memory grow on demand, using only as much as needed #tf_config.gpu_options.per_process_gpu_memory_fraction=True sets the fraction of GPU memory to use steps_per_epoch = train_manager.len_data # how many batches the data is split into per epoch with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) # model initialisation finished logger.info("start training") loss = [] # with tf.device("/gpu:0"): comment this out if there is no GPU; the CNN requires sentences of the same length for i in range(100): # number of epochs; fetch the data on each pass for batch in train_manager.iter_batch(shuffle=True): # draw batches in random order step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) save the model if it is better than the previous one if i % 7 == 0: save_model(sess, model, FLAGS.ckpt_path, logger)
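char_mapping and tag_mapping both follow the same pattern: count every character (or tag) across the corpus and assign dense ids in decreasing-frequency order, returning the (dico, item_to_id, id_to_item) triple that the rest of the script unpacks. A simplified, hedged sketch of that pattern (the real implementation also reserves special entries such as padding and unknown symbols):

from collections import Counter

def create_mapping(items):
    """Count items and assign ids in decreasing-frequency order.

    Returns (frequency dict, item_to_id, id_to_item), mirroring the
    (dico, x_to_id, id_to_x) triples used throughout the training code."""
    dico = Counter(items)
    sorted_items = sorted(dico.items(), key=lambda kv: (-kv[1], kv[0]))
    item_to_id = {item: i for i, (item, _) in enumerate(sorted_items)}
    id_to_item = {i: item for item, i in item_to_id.items()}
    return dict(dico), item_to_id, id_to_item

sentences = [[["我", "O"], ["要", "O"], ["看", "O"]],
             [["看", "O"], ["路", "B-ROAD"]]]
chars = [w[0] for s in sentences for w in s]
tags = [w[-1] for s in sentences for w in s]
_, char_to_id, id_to_char = create_mapping(chars)
_, tag_to_id, id_to_tag = create_mapping(tags)
print(char_to_id)  # e.g. {'看': 0, '我': 1, '要': 2, '路': 3}
print(tag_to_id)   # e.g. {'O': 0, 'B-ROAD': 1}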
os.makedirs(eval_temp) if not os.path.exists(models_path): os.makedirs(models_path) # Initialize model model = Model(parameters=parameters, models_path=models_path) print "Model location: %s" % model.model_path # Data parameters lower = parameters['lower'] zeros = parameters['zeros'] tag_scheme = parameters['tag_scheme'] # Load sentences train_sentences = loader.load_sentences(opts.train, lower, zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, tag_scheme) # Create a dictionary / mapping of words # If we use pretrained embeddings, we add them to the dictionary. dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) dico_words_train = dico_words # Create a dictionary and a mapping for words / POS tags / tags dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) # Save the mappings to disk print 'Saving the mappings to disk...' # how does this work? should there be a mapping stored on disk and loaded every time? model.save_mappings(id_to_word, id_to_char, id_to_tag) # Build the model f_train, f_eval = model.build(**parameters)
model.reload() from tabulate import tabulate embeddings_params, model_params = model.count_parameters() print tabulate(embeddings_params, ["name", "shape", "size"]) print tabulate(model_params, ["name", "shape", "size"]) print "Parameters: \n *", "\n * ".join([str(k) + " = " + str(v) for k, v in model.parameters.items()]) # print "Parameters: \n", model.parameters test_file = opts.test_file out_file = opts.out_file test_sentences = loader.load_sentences(test_file, lower, zeros) update_tag_scheme(test_sentences, tag_scheme) test_data = prepare_dataset2( test_sentences, word_to_id, char_to_id, tag_to_id, model.feature_maps, lower ) print "input: ", test_file, ":", len(test_sentences), len(test_data) print "output: ", out_file import datetime t1 = datetime.datetime.now() predict(parameters, f_eval, test_sentences, test_data, model.id_to_tag, out_file, add_O_tags=opts.add_o_tag) t2 = datetime.datetime.now() print "done in ", (t2-t1).total_seconds(), "seconds"
def train(): # load data sets train_sentences = load_sentences( FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # dimension:num_sentence*len_sentence*2 dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme( train_sentences, FLAGS.tag_schema) # dimension:num_sentence*len_sentence*2 update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: # if pretrained embeddings are used dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[ 0] # dico_chars_train dimension: (number of distinct characters in the training set) * 2 dico_chars, char_to_id, id_to_char = augment_with_pretrained( # supplement dico_chars_train with characters from the test set dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: # create the map_file pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower) # dimension: NumSentence*4*LenSentence dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))) train_manager = BatchManager( train_data, FLAGS.batch_size ) # batch_data dimension: BatchNum*4*BatchSize*MaxLenSentence dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): # if config_file already exists, load it config = load_config(FLAGS.config_file) else: # otherwise create a new config and save it to file config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # print the config to the log file # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # allocate GPU memory dynamically steps_per_epoch = train_manager.len_data # len_data: ceil(NumSentence/BatchSize) with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): # the number in range() is the number of epochs for batch in train_manager.iter_batch( shuffle=True ): # take one batch from batch_data at a time; shuffle=True randomises the batch order step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger) # View the tensorboard graph by running the following code and then going to the terminal and typing: # tensorboard --logdir =
tensorboard_logs merged = tf.summary.merge_all() if not os.path.exists('tensorboard_logs/'): os.makedirs('tensorboard_logs/') my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
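The fragment above merges whatever summaries exist and opens a FileWriter, but for scalar curves such as the NER loss to actually appear in TensorBoard, a summary op has to be registered and the merged result written at each step. A minimal TensorFlow 1.x sketch of that wiring (the tensor name, the log directory and the dummy loss values are illustrative):

import os
import tensorflow as tf  # assumes TensorFlow 1.x, as in the training scripts above

# toy graph: a placeholder standing in for the batch loss
loss = tf.placeholder(tf.float32, name="loss")
tf.summary.scalar("NER_loss", loss)      # register a scalar curve
merged = tf.summary.merge_all()          # one op that evaluates every registered summary

log_dir = "tensorboard_logs/"            # illustrative directory
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

with tf.Session() as sess:
    writer = tf.summary.FileWriter(log_dir, sess.graph)
    for step, batch_loss in enumerate([3.2, 2.1, 1.4], start=1):
        summary = sess.run(merged, feed_dict={loss: batch_loss})
        writer.add_summary(summary, step)   # summaries are indexed by global step
    writer.close()
# then inspect with: tensorboard --logdir=tensorboard_logs/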
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences]) ) ) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower ) dev_data = prepare_dataset( dev_sentences, char_to_id, tag_to_id, FLAGS.lower ) test_data = prepare_dataset( test_sentences, char_to_id, tag_to_id, FLAGS.lower ) print("%i / %i / %i sentences in train / dev / test." % ( len(train_data), len(dev_data), len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): #print batch step, batch_loss = model.run_step(sess, True, batch) #print step loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
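BatchManager(data, batch_size) in these scripts groups sentences into batches and pads every batch to the length of its longest sentence, and iter_batch(shuffle=True) then yields the batches in random order. A hedged stand-alone sketch of that strategy (the function name, the padding id and the sort-by-length step are assumptions, not the project's class):

import math
import random

def make_batches(data, batch_size, pad_id=0, shuffle=True):
    """Sort sentences by length, cut into batches, and pad each batch
    to its own longest sentence (a common BatchManager-style strategy)."""
    data = sorted(data, key=len)
    num_batches = int(math.ceil(len(data) / float(batch_size)))
    batches = []
    for b in range(num_batches):
        chunk = data[b * batch_size:(b + 1) * batch_size]
        max_len = max(len(s) for s in chunk)
        batches.append([s + [pad_id] * (max_len - len(s)) for s in chunk])
    if shuffle:
        random.shuffle(batches)  # shuffle the batch order, as iter_batch(shuffle=True) does
    return batches

sentences = [[1, 2], [3, 4, 5, 6], [7], [8, 9, 10]]
for batch in make_batches(sentences, batch_size=2, shuffle=False):
    print(batch)
# [[7, 0], [1, 2]]
# [[8, 9, 10, 0], [3, 4, 5, 6]]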
# Initialize model model = Model(parameters=parameters, models_path=models_path) print "Model location: %s" % model.model_path # Data parameters lower = parameters['lower'] zeros = parameters['zeros'] tag_scheme = parameters['tag_scheme'] # Load sentences train_sentences = loader.load_sentences(opts.train, lower, zeros) dev_sentences = loader.load_sentences(opts.dev, lower, zeros) test_sentences = loader.load_sentences(opts.test, lower, zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, tag_scheme) update_tag_scheme(dev_sentences, tag_scheme) update_tag_scheme(test_sentences, tag_scheme) # Create a dictionary / mapping of words # If we use pretrained embeddings, we add them to the dictionary. if parameters['pre_emb']: dico_words_train = word_mapping(train_sentences, lower)[0] dico_words, word_to_id, id_to_word = augment_with_pretrained( dico_words_train.copy(), parameters['pre_emb'], list(itertools.chain.from_iterable( [[w[0] for w in s] for s in dev_sentences + test_sentences]) ) if not parameters['all_emb'] else None ) else:
def train(): # load data sets # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) all_train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) train_sentences, dev_sentences = split_train_dev(all_train_sentences) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # update_tag_scheme(dev_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: # dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars_train = char_mapping(all_train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(all_train_sentences, FLAGS.lower) # _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(all_train_sentences) # _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09') #l_sorted_lexcion = load_lexcion(FLAGS.lexcion_file, nlp) l_sorted_lexcion = [] # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." 
% (len(train_data), len(dev_data), len(test_data))) max_len = max( [len(sentence[0]) for sentence in train_data + test_data + dev_data]) train_manager = BatchManager(train_data, FLAGS.batch_size, max_len) dev_manager = BatchManager(dev_data, 800, max_len) test_manager = BatchManager(test_data, 800, max_len) # random.shuffle(train_data) # pad_test_data = pad_data(test_data) # pad_dev_data = pad_data(dev_data) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id, max_len) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): random.shuffle(train_data) pad_train_data = pad_data(train_data, max_len) strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets = pad_train_data for j in range(0, len(strings), FLAGS.batch_size): batch = [ strings[j:j + FLAGS.batch_size], chars[j:j + FLAGS.batch_size], lexcion_teatures[j:j + FLAGS.batch_size], pos_ids[j:j + FLAGS.batch_size], dep_ids[j:j + FLAGS.batch_size], head_ids[j:j + FLAGS.batch_size], targets[j:j + FLAGS.batch_size] ] step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "AS loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger, i) evaluate(sess, model, "test", test_manager, id_to_tag, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
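Unlike the other variants, this one derives its dev set from the training file with split_train_dev instead of loading a separate dev file. A minimal sketch of such a split (the 90/10 ratio, the shuffling and the fixed seed are assumptions; the real helper may split differently):

import random

def split_train_dev(sentences, dev_ratio=0.1, seed=42):
    """Shuffle sentences and split off a dev portion (illustrative only)."""
    rng = random.Random(seed)          # fixed seed keeps the split reproducible
    sentences = list(sentences)
    rng.shuffle(sentences)
    n_dev = max(1, int(len(sentences) * dev_ratio))
    return sentences[n_dev:], sentences[:n_dev]   # (train, dev)

all_sentences = [["sent-%d" % i] for i in range(10)]
train, dev = split_train_dev(all_sentences)
print(len(train), len(dev))   # 9 1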
raise Exception('CoNLL evaluation script not found at "%s"' % utils.eval_script) if not os.path.exists(utils.eval_temp): os.makedirs(utils.eval_temp) if not os.path.exists(models_path): os.makedirs(models_path) lower = parameters["lower"] zeros = parameters["zeros"] tag_scheme = parameters["tag_scheme"] train_sentences = loader.load_sentences(opts.train, lower, zeros) dev_sentences = loader.load_sentences(opts.dev, lower, zeros) test_sentences = loader.load_sentences(opts.test, lower, zeros) test_train_sentences = loader.load_sentences(opts.test_train, lower, zeros) loader.update_tag_scheme(train_sentences, tag_scheme) loader.update_tag_scheme(dev_sentences, tag_scheme) loader.update_tag_scheme(test_sentences, tag_scheme) loader.update_tag_scheme(test_train_sentences, tag_scheme) dico_words_train = loader.word_mapping(train_sentences, lower)[0] dico_words, word_to_id, id_to_word = loader.augment_with_pretrained( dico_words_train.copy(), parameters["pre_emb"], list(itertools.chain.from_iterable([[w[0] for w in s] for s in dev_sentences + test_sentences])) if not parameters["all_emb"] else None, ) dico_chars, char_to_id, id_to_char = loader.char_mapping(train_sentences) dico_tags, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)
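augment_with_pretrained extends the training-corpus dictionary with words from the pretrained-embedding vocabulary: either only those that also occur in the dev/test text (the default) or the whole embedding vocabulary when all_emb is set, so that such words are not collapsed onto the unknown id at test time. A hedged sketch of that behaviour (the signature and the count of 0 for embedding-only words are assumptions):

def augment_with_pretrained(dico_train, emb_vocab, extra_words=None):
    """Add pretrained-embedding words to the dictionary.

    If extra_words is given, only add embedding words that also occur in
    extra_words (typically the dev+test tokens); if it is None, add the
    whole embedding vocabulary, mirroring the all_emb flag."""
    dico = dict(dico_train)
    for word in emb_vocab:
        if word not in dico and (extra_words is None or word in extra_words):
            dico[word] = 0   # count 0: seen only in the embedding file
    item_to_id = {w: i for i, w in enumerate(sorted(dico))}
    id_to_item = {i: w for w, i in item_to_id.items()}
    return dico, item_to_id, id_to_item

dico_train = {"路": 3, "门": 2}
emb_vocab = {"路", "桥", "湖"}
dico, word_to_id, _ = augment_with_pretrained(dico_train, emb_vocab, extra_words={"桥"})
print(sorted(dico))   # ['桥', '路', '门']  ('湖' is skipped: it never occurs in dev/test)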
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) # 检测并维护数据集的 tag 标记 update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) update_tag_scheme(dev_sentences, FLAGS.tag_schema) # create maps if not exist # 根据数据集创建 char_to_id, id_to_char, tag_to_id, id_to_tag 字典,并储存为 pkl 文件 if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] # 利用预训练嵌入集增强(扩充)字符字典,然后返回字符与位置映射关系 dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags # 获取标记与位置映射关系 _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) #with open('maps.txt','w',encoding='utf8') as f1: #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n') with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # 提取句子特征 # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))) # 获取可供模型训练的单个批次数据 train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # 训练集全量跑一次需要迭代的次数 steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: # 此处模型创建为项目最核心代码 model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] with tf.device("/gpu:0"): for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if i % 7 == 0: save_model(sess, model, FLAGS.ckpt_path, logger)
# Initialize model model = Model(parameters=parameters, models_path=models_path) print "Model location: %s" % model.model_path # Data parameters lower = parameters['lower'] zeros = parameters['zeros'] tag_scheme = parameters['tag_scheme'] # Load sentences train_sentences = loader.load_sentences(opts.train, lower, zeros) dev_sentences = loader.load_sentences(opts.dev, lower, zeros) test_sentences = loader.load_sentences(opts.test, lower, zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, tag_scheme) update_tag_scheme(dev_sentences, tag_scheme) update_tag_scheme(test_sentences, tag_scheme) # Create a dictionary / mapping of words # If we use pretrained embeddings, we add them to the dictionary. if parameters['pre_emb']: dico_words_train = word_mapping(train_sentences, lower)[0] dico_words, word_to_id, id_to_word = augment_with_pretrained( dico_words_train.copy(), parameters['pre_emb'], list(itertools.chain.from_iterable( [[w[0] for w in s] for s in dev_sentences + test_sentences]) ) if not parameters['all_emb'] else None ) else:
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist, load data if exists maps if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # set up the training-log directory train_log = os.path.join(FLAGS.logdir, "train") if not os.path.exists(train_log): os.makedirs(train_log) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data # the number of batches per epoch with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) # inspect the constructed computation graph train_writer = tf.summary.FileWriter(train_log, sess.graph) logger.info("start training") loss = [] dev_f1 = [] test_f1 = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss, merged = model.run_step( sess, True, batch) # step is the global step # write the summary to TensorBoard at each iteration train_writer.add_summary(merged, step) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # use the dev data to validate the model best, dev_f1_value = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) # store the dev f1 dev_f1.append(dev_f1_value) if best: save_model(sess, model, FLAGS.ckpt_path, logger) # use the current model on the test set _, test_f1_value = evaluate(sess, model, "test", test_manager, id_to_tag, logger) # store the test f1 test_f1.append(test_f1_value) # write the dev_f1 and test_f1 to file f1_result = {} f1_result["dev_f1"] = dev_f1
f1_result["test_f1"] = test_f1 write_data_to_file(f1_result, "f1_result")
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] #best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) #if best: save_model(sess, model, FLAGS.ckpt_path, logger)
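All of these training loops report progress with the same arithmetic: the global step returned by model.run_step is folded back into an epoch index ("iteration") and a position within the epoch using steps_per_epoch. A small worked example of that bookkeeping, using the 3027-sentence / batch-size-20 figures quoted earlier (so steps_per_epoch = 152):

steps_per_epoch = 152      # e.g. ceil(3027 sentences / batch size 20)

for global_step in (1, 152, 153):
    iteration = global_step // steps_per_epoch + 1
    step_in_epoch = global_step % steps_per_epoch
    print("step %d -> iteration %d, %d/%d"
          % (global_step, iteration, step_in_epoch, steps_per_epoch))
# step 1 -> iteration 1, 1/152
# step 152 -> iteration 2, 0/152   (an epoch boundary is reported as step 0 of the next epoch)
# step 153 -> iteration 2, 1/152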
def train(X_train,X_dev,X_test): # load data sets train_sentences = X_train dev_sentences = X_dev test_sentences = X_test train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros) dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros) test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros) train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros) dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros) test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros) train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros) dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros) test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) update_tag_scheme(train_sentences_loc, FLAGS.tag_schema) update_tag_scheme(test_sentences_loc, FLAGS.tag_schema) update_tag_scheme(train_sentences_per, FLAGS.tag_schema) update_tag_scheme(test_sentences_per, FLAGS.tag_schema) update_tag_scheme(train_sentences_org, FLAGS.tag_schema) update_tag_scheme(test_sentences_org, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences]) ) ) dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0] dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained( dico_chars_train_loc.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences_loc]) ) ) dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0] dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained( dico_chars_train_per.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences_per]) ) ) dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0] dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained( dico_chars_train_org.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences_org]) ) ) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower) _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower) _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc) _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per) _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f) else: with open(FLAGS.map_file, "rb") as f: 
char_to_id, id_to_char, tag_to_id, id_to_tag,char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower ) dev_data = prepare_dataset( dev_sentences, char_to_id, tag_to_id, FLAGS.lower ) test_data = prepare_dataset( test_sentences, char_to_id, tag_to_id, FLAGS.lower ) print("%i / %i / %i sentences in train / dev / test." % ( len(train_data),len(dev_data), len(test_data))) train_data_loc = prepare_dataset_ner( train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower ) dev_data_loc = prepare_dataset_ner( dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower ) test_data_loc = prepare_dataset_ner( test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower ) print("%i / %i / %i sentences_loc in train / dev / test." % ( len(train_data_loc), len(dev_data_loc), len(test_data_loc))) train_data_per = prepare_dataset_ner( train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower ) dev_data_per = prepare_dataset_ner( dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower ) test_data_per = prepare_dataset_ner( test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower ) print("%i / %i / %i sentences_per in train / dev / test." % ( len(train_data_per), len(dev_data_per), len(test_data_per))) train_data_org = prepare_dataset_ner( train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower ) dev_data_org = prepare_dataset_ner( dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower ) test_data_org = prepare_dataset_ner( test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower ) print("%i / %i / %i sentences_org in train / dev / test." 
% ( len(train_data_org), len(dev_data_org), len(test_data_org))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size) train_manager_per = BatchManager(train_data_per, FLAGS.batch_size) train_manager_org = BatchManager(train_data_org, FLAGS.batch_size) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id,char_to_id_loc, tag_to_id_loc,char_to_id_per, tag_to_id_per,char_to_id_org, tag_to_id_org) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data steps_per_epoch_loc = train_manager_loc.len_data steps_per_epoch_per = train_manager_per.len_data steps_per_epoch_org = train_manager_org.len_data model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org, logger) with tf.Session(config=tf_config, graph = model.graph ) as sess: sess.run(tf.global_variables_initializer()) if config["pre_emb"]: emb_weights = sess.run(model.char_lookup.read_value()) emb_weights_ner = sess.run(model.char_lookup.read_value()) emb_weights, emb_weights_ner = load_word2vec(config["emb_file"], id_to_char, id_to_char_loc,id_to_char_per,id_to_char_org, config["char_dim"], emb_weights, emb_weights_ner) sess.run(model.char_lookup.assign(emb_weights)) logger.info("Load pre-trained embedding.") logger.info("start training") loss = [] loss_loc = [] loss_per = [] loss_org = [] for i in range(100): for batch_loc in train_manager_loc.iter_batch(shuffle=True): step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc) loss_loc.append(batch_loss_loc) if step_loc % FLAGS.steps_check == 0: iteration_loc = step_loc // steps_per_epoch_loc + 1 logger.info("iteration:{} step_loc:{}/{}, " "NER loss:{:>9.6f}".format( iteration_loc, step_loc % steps_per_epoch_loc, steps_per_epoch_loc, np.mean(loss_loc))) loss_loc = [] for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration_1 = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "SKILL loss:{:>9.6f}".format( iteration_1, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] precision_loc_dev = model.precision(sess, dev_manager, id_to_tag) precision_loc_test = model.precision(sess, test_manager, id_to_tag) for batch_per in train_manager_per.iter_batch(shuffle=True): step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per) loss_per.append(batch_loss_per) if step_per % FLAGS.steps_check == 0: iteration_per = step_per // steps_per_epoch_per + 1 logger.info("iteration:{} step_per:{}/{}, " "NER loss:{:>9.6f}".format( iteration_per, step_per % steps_per_epoch_per, steps_per_epoch_per, np.mean(loss_per))) loss_per = [] for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration_2 = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "SKILL loss:{:>9.6f}".format( iteration_2, step % steps_per_epoch, 
steps_per_epoch, np.mean(loss))) loss = [] precision_per_dev = model.precision(sess, dev_manager, id_to_tag) precision_per_test = model.precision(sess, test_manager, id_to_tag) for batch_org in train_manager_org.iter_batch(shuffle=True): step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org) loss_org.append(batch_loss_org) if step_org % FLAGS.steps_check == 0: iteration_org = step_org // steps_per_epoch_org + 1 logger.info("iteration:{} step_org:{}/{}, " "NER loss:{:>9.6f}".format( iteration_org, step_org % steps_per_epoch_org, steps_per_epoch_org, np.mean(loss_org))) loss_org = [] for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration_3 = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "SKILL loss:{:>9.6f}".format( iteration_3, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] precision_org_dev = model.precision(sess, dev_manager, id_to_tag) precision_org_test = model.precision(sess, test_manager, id_to_tag) best = evaluate(sess, model, "dev", dev_manager, id_to_tag,precision_loc_dev,precision_per_dev,precision_org_dev, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) best_test,results= evaluate(sess, model, "test", test_manager, id_to_tag,precision_loc_test,precision_per_test,precision_org_test, logger) with open("CDTL_PSE-result.csv", "a",encoding='utf-8')as st_re: st_re.write(str(results).replace("[", "").replace("]", "")) st_re.write("\n")
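The multi-task variant above pickles sixteen mapping objects and unpacks them positionally, which is easy to get out of order when auxiliary tasks are added or removed. A keyed structure is less fragile; a hedged sketch of that alternative layout (purely a suggestion with placeholder contents, not the project's on-disk format):

import pickle

# illustrative placeholder mappings for the main task and one auxiliary task
mappings = {
    "main": {"char_to_id": {"路": 0}, "tag_to_id": {"O": 0, "B-SKILL": 1}},
    "loc":  {"char_to_id": {"路": 0}, "tag_to_id": {"O": 0, "B-LOC": 1}},
}

with open("maps.pkl", "wb") as f:
    pickle.dump(mappings, f)

with open("maps.pkl", "rb") as f:
    mappings = pickle.load(f)
tag_to_id_loc = mappings["loc"]["tag_to_id"]   # accessed by name, not by position
print(tag_to_id_loc)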