def init_emb_weights(shape, id_to_char):
    # emb_shape = (len(id_to_char), 100)
    initializer = Init.xavier_initializer()
    emb_weights = Kb.variable(initializer(shape), dtype=None, name="char_embedding")
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())
        # materialize the random init, then overwrite rows with pretrained vectors
        emb_weights = sess.run(emb_weights.read_value())
    emb_weights = load_word2vec(FLAGS.emb_file, id_to_char, FLAGS.char_dim, emb_weights)
    return emb_weights
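# ---------------------------------------------------------------------------
# For reference: a minimal sketch of what a four-argument load_word2vec
# usually does in this pattern (random matrix in, pretrained rows copied over
# where the token is found, random init kept for OOV tokens). This is an
# illustrative assumption, not the project's actual loader; the real file
# format handling may differ.
import codecs
import numpy as np

def load_word2vec_sketch(emb_path, id_to_word, word_dim, old_weights):
    pre_trained = {}
    for line in codecs.open(emb_path, 'r', 'utf-8'):
        parts = line.rstrip().split()
        if len(parts) == word_dim + 1:  # expected line: "token v1 ... v{word_dim}"
            pre_trained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    for i in range(len(id_to_word)):
        if id_to_word[i] in pre_trained:
            old_weights[i] = pre_trained[id_to_word[i]]
    return old_weights
# ---------------------------------------------------------------------------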
def __init__(self, params):
    super(SequenceToSequence, self).__init__()
    self.embedding_matrix = load_word2vec(params)
    self.params = params
    self.encoder = rnn_encoder.Encoder(params["vocab_size"],
                                       params["embed_size"],
                                       params["enc_units"],
                                       params["batch_size"],
                                       self.embedding_matrix)
    self.attention = rnn_decoder.BahdanauAttention(params["attn_units"])
    self.decoder = rnn_decoder.Decoder(params["vocab_size"],
                                       params["embed_size"],
                                       params["dec_units"],
                                       params["batch_size"],
                                       self.embedding_matrix)
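# Hypothetical usage of SequenceToSequence above. The keys mirror those read
# in __init__; the values are illustrative only, and load_word2vec(params)
# presumably needs extra keys (e.g. an embedding path) not shown here.
params = {
    "vocab_size": 30000, "embed_size": 256, "enc_units": 256,
    "dec_units": 256, "attn_units": 128, "batch_size": 32,
}
seq2seq = SequenceToSequence(params)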
def eval_model(id_to_char, id_to_tag, test_manager, device, model_name=None):
    print("Eval ......")
    if not model_name:
        model_name = args.log_name
    old_weights = np.random.rand(len(id_to_char), args.word_embed_dim)
    pre_word_embed = load_word2vec("100.utf8", id_to_char, args.word_embed_dim, old_weights)
    e_model = Model(args, id_to_tag, device, pre_word_embed).to(device)
    e_model.load_state_dict(torch.load("./models/" + model_name + ".pkl"))
    print("model loaded ...")
    e_model.eval()
    all_results = []
    for batch in test_manager.iter_batch():
        strs, lens, chars, segs, subtypes, tags, adj, dep = batch
        chars = torch.LongTensor(chars).to(device)
        _lens = torch.LongTensor(lens).to(device)
        subtypes = torch.LongTensor(subtypes).to(device)
        tags = torch.LongTensor(tags).to(device)
        adj = torch.FloatTensor(adj).to(device)
        dep = torch.LongTensor(dep).to(device)
        logits, _ = e_model(chars, _lens, subtypes, adj, dep)
        # Evaluate: greedy decode per sentence
        batch_paths = []
        for index in range(len(logits)):
            length = lens[index]
            score = logits[index][:length]      # [seq, dim]
            probs = F.softmax(score, dim=-1)    # [seq, dim]
            path = torch.argmax(probs, dim=-1)  # [seq]
            batch_paths.append(path)
        for i in range(len(strs)):
            result = []
            string = strs[i][:lens[i]]
            gold = iobes_iob([id_to_tag[int(x)] for x in tags[i][:lens[i]]])
            pred = iobes_iob([id_to_tag[int(x)] for x in batch_paths[i][:lens[i]]])
            # avoid shadowing the gold/pred lists inside the loop
            for char, gold_tag, pred_tag in zip(string, gold, pred):
                result.append(" ".join([char, gold_tag, pred_tag]))
            all_results.append(result)
    all_eval_lines = test_ner(all_results, args.result_path, args.log_name)
    res_info = all_eval_lines[1].strip()
    f1 = float(res_info.split()[-1])
    print("eval: f1: {}".format(f1))
    return f1, res_info
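# Side note on the decode above: softmax is strictly order-preserving, so the
# greedy path could equally be taken from the raw logits. Self-contained check:
import torch

_logits = torch.randn(5, 7)  # [seq, num_tags]
assert torch.equal(
    torch.argmax(torch.softmax(_logits, dim=-1), dim=-1),
    torch.argmax(_logits, dim=-1))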
def get_embedding(self, inputs, id_to_word):
    # embedding layer for input projection
    with tf.variable_scope("Embedding"), tf.device('/cpu:0'):
        if not self.params.pre_emb:
            embedding = tf.get_variable(
                "word_emb", [self.num_words, self.params.word_dim],
                initializer=init_ops.uniform_unit_scaling_initializer())
        else:
            print("load word2vec")
            embedding = tf.get_variable(
                "word_emb", dtype=tf.float32,
                initializer=np.asarray(
                    load_word2vec(self.params.pre_emb, id_to_word),
                    dtype=np.float32))
        x = tf.nn.embedding_lookup(embedding, inputs)
    return x
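# Shape sketch for the lookup above (TF1.x; all values illustrative): integer
# ids of shape [batch, seq] index rows of the embedding matrix, producing
# [batch, seq, word_dim].
import numpy as np
import tensorflow as tf

_emb = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))  # [vocab=4, dim=3]
_ids = tf.constant([[0, 2], [3, 1]])                               # [batch=2, seq=2]
_out = tf.nn.embedding_lookup(_emb, _ids)                          # [2, 2, 3]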
def train(id_to_char, id_to_tag, train_manager, dev_manager, device):
    old_weights = np.random.rand(len(id_to_char), args.word_embed_dim)
    pre_word_embed = load_word2vec("100.utf8", id_to_char, args.word_embed_dim, old_weights)
    if args.label_weights:
        label_weights = torch.ones([len(id_to_tag)]) * args.label_weights
        label_weights[0] = 1.0  # none
        label_weights = label_weights.to(device)
    else:
        label_weights = None
    model = Model(args, id_to_tag, device, pre_word_embed).to(device)
    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    weight_decay=args.weight_decay)
    else:
        # fail fast instead of hitting a NameError on `optimizer` later
        raise ValueError("unsupported optimizer: {}".format(args.optimizer))
    print("device: ", model.device)
    MAX_F1 = 0
    for epoch in range(args.epoch):
        log_handler.info("Epoch: {} / {} :".format(epoch + 1, args.epoch))
        log_handler.info("epoch {}, lr: {} ".format(epoch + 1, get_learning_rate(optimizer)))
        loss = train_epoch(model, optimizer, train_manager, label_weights, device)
        log_handler.info("epoch {}, loss : {}".format(epoch + 1, loss))
        f1, dev_model = dev_epoch(epoch, model, dev_manager, id_to_tag, device)
        log_handler.info("epoch {}, f1 : {}".format(epoch + 1, f1))
        if f1 > MAX_F1:
            MAX_F1 = f1
            torch.save(dev_model.state_dict(), "./models/{}.pkl".format(args.log_name))
        log_handler.info("epoch {}, MAX_F1: {}\n".format(epoch + 1, MAX_F1))
        print()
def interface(self, msg):
    ckpt = tf.train.get_checkpoint_state(self.ckpt_path)
    # model = Model(load_config(self.config_file))
    logger = get_logger(self.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(self.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    self.model.saver = tf.train.import_meta_graph(basedir + '/ckpt/ner.ckpt.meta')
    sess = tf.Session(config=tf_config)
    # with tf.Session(config=tf_config) as sess:
    with sess.as_default():
        # sess.run(tf.global_variables_initializer())
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            # self.model = tf.saved_model.loader.load(sess, ckpt.model_checkpoint_path)
            # self.model.saver = tf.train.import_meta_graph(basedir + '/ckpt/ner.ckpt.meta')
            self.model.saver.restore(sess, ckpt.model_checkpoint_path)
            # self.model.saver.restore(sess, tf.train.latest_checkpoint(basedir + '/ckpt/'))
        else:
            logger.info("Created model with fresh parameters.")
            sess.run(tf.global_variables_initializer())
            if self.config_file["pre_emb"]:
                emb_weights = sess.run(self.model.char_lookup.read_value())
                emb_weights = load_word2vec(self.config_file["emb_file"], id_to_char,
                                            self.config_file["char_dim"], emb_weights)
                sess.run(self.model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")
        if msg:
            result = self.model.evaluate_line(sess, input_from_line(msg, char_to_id),
                                              id_to_tag)
            return result
# Dynamically import the configuration module that matches the chosen model.
x = import_module('models.' + model_name)
# Parameter initialization happens in the chosen model's Config/__init__.
config = x.Config(dataset)
start_time = time.time()
print("Loading data...")
(train_data, dev_data, test_data, train_sentences, test_sentences, dev_sentences,
 word_to_id, id_to_word, tag_to_id, id_to_tag) = load_model_dataset(config)
config.n_vocab = len(word_to_id)
time_dif = data_utils.get_time_dif(start_time)
print("Time usage:", time_dif)
embedding_pretrained = data_utils.load_word2vec(config, id_to_word)
train_X, train_Y = data_utils.get_X_and_Y_data(train_data, config.max_len, len(tag_to_id))
dev_X, dev_Y = data_utils.get_X_and_Y_data(dev_data, config.max_len, len(tag_to_id))
test_X, test_Y = data_utils.get_X_and_Y_data(test_data, config.max_len, len(tag_to_id))
# train
train_dataset = tf.data.Dataset.from_tensor_slices((train_X, train_Y))
train_dataset = train_dataset.shuffle(len(train_X)).batch(config.batch_size,
                                                          drop_remainder=True)
def main(args=None):
    if args is None:
        args = load_arg()
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)
    print_args(args)
    if args.glove is not None:
        embedding_vectors, word2id = load_word2vec(args.glove, vocab=args.vocab,
                                                   use_gensim=False)
    else:
        embedding_vectors, word2id = load_word2vec(args.word2vec, vocab=args.vocab,
                                                   use_gensim=True)
    train_dataset, dev_dataset, test_dataset = load_tacred_dataset(args, word2id)
    if args.entity_mask:
        mask_vectors = torch.randn(len(train_dataset.ner_tags) * 2,
                                   embedding_vectors.size(1))
        embedding_vectors = torch.cat([embedding_vectors, mask_vectors], dim=0)
    label_weights = train_dataset.label_weights if args.label_weights else None
    model = CNNForRE(args, embedding_vectors,
                     pad_id=train_dataset.pad_id,
                     num_labels=train_dataset.num_labels,
                     label_weights=label_weights)
    do_train = True
    if os.path.exists(f"{args.output}/pytorch_model.bin"):
        model.load_state_dict(
            torch.load(f"{args.output}/pytorch_model.bin", map_location="cpu"))
        do_train = False
    model.to(args.device)
    preds, scores = {}, {}
    (model, scores["train"], scores["dev"], preds["train"], preds["dev"],
     best_epoch) = train(args, train_dataset, dev_dataset, model, do_train)
    test_score = None
    if args.do_eval:
        scores["test"], preds["test"] = eval(args, test_dataset, model)
        print(f"|{'TEST':<7}|{scores['test']['precision']:>6.2f}"
              f"|{scores['test']['recall']:>6.2f}|{scores['test']['f1']:>6.2f}|")
    model.to("cpu")
    if args.output is not None:
        os.makedirs(f"{args.output}/predictions", exist_ok=True)
        save_model(args.output, model)
        save_args(args.output, args)
        save_preds(f"{args.output}/predictions", preds)
        save_json(f"{args.output}/scores.json", scores)
    return model, scores, best_epoch
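# Illustration of the entity-mask trick in main() above: appending
# 2 * len(ner_tags) random rows lets masked subject/object tokens be addressed
# by ids just past the original vocabulary. The exact id scheme is an
# assumption; only the concatenation mirrors the code above.
import torch

_vocab_size, _dim, _num_ner_tags = 100, 8, 3
_emb = torch.randn(_vocab_size, _dim)
_emb = torch.cat([_emb, torch.randn(_num_ner_tags * 2, _dim)], dim=0)
_subj_mask_id = _vocab_size  # first appended row, e.g. SUBJ-<tag 0>
assert _emb[_subj_mask_id].shape == (_dim,)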
def do_train(config):
    # load the datasets
    train, dev, test = load_data(config)
    # create or load the maps
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train, config)
    # record sizes and save the config
    config["num_chars"] = len(word_to_id)  # total number of words
    config["num_tags"] = len(tag_to_id)    # total number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
    # data preparation
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])
    print("train/dev/test sentence counts: {} / {} / {}".format(
        len(train_data), len(dev_data), len(test_data)))
    # batching
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    steps_per_epoch = train_manager.len_data  # steps per epoch
    # create the needed paths
    make_path(config)
    # logger
    logger = get_logger(config["log_file"])
    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # create the model, optionally from existing parameters
        model = Model(config)
        # look up a checkpoint in the model path
        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            # existing model
            logger.info("Restoring existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Creating a new model...")
            # without pre-trained embeddings
            sess.run(tf.global_variables_initializer())
            # with pre-trained embeddings
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word,
                                            config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")
        logger.info("Start training...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch,
                        np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)
# build indexes
if not os.path.isfile(indexes_pkl) or not os.path.isfile(indexes_size_pkl):
    log.info("build indexes")
    indexes = {}
    indexes_size = {}
else:
    log.info("previous indexes ({})".format(indexes_pkl))
    indexes = load_from_pkl(indexes_pkl)
    indexes_size = load_from_pkl(indexes_size_pkl)
indexes['words2id'], indexes_size['words2id'] = build_index(train['words'])
indexes['rel_senses2id'], indexes_size['rel_senses2id'] = build_index(train['rel_senses'])
log.info("  " + ", ".join(["{}: {}".format(k, v) for k, v in indexes_size.items()]))
save_to_pkl(indexes_pkl, indexes)
save_to_pkl(indexes_size_pkl, indexes_size)
init_weights = load_word2vec(indexes['words2id'], indexes_size['words2id'],
                             words_dim, words2vec_bin, words2vec_txt)

# build model
log.info("build model")
words2id_size = indexes_size['words2id']
rel_senses2id_size = indexes_size['rel_senses2id']
shared_emb = Embedding(input_dim=words2id_size, output_dim=words_dim,
                       weights=init_weights, dropout=words_dropout,
                       mask_zero=True, name="shared_emb")
# input: arg1 word/token ids, shape: (sample, arg1_len) of words2id_size
arg1_ids = Input(shape=(arg1_len,), dtype='int32', name="arg1_ids")
# input: arg2 word/token ids, shape: (sample, arg2_len) of words2id_size
arg2_ids = Input(shape=(arg2_len,), dtype='int32', name="arg2_ids")
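# Hypothetical sketch of build_index used above: reserving id 0 for padding is
# what makes mask_zero=True on shared_emb valid. The real implementation may
# additionally handle OOV tokens and frequency cutoffs.
def build_index_sketch(sequences):
    index = {}
    for seq in sequences:
        for token in seq:
            if token not in index:
                index[token] = len(index) + 1  # 0 stays reserved for masking
    return index, len(index) + 1  # mapping and vocab size incl. the padding id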
def train(X_train, X_dev, X_test):
    # load data sets
    train_sentences = X_train
    dev_sentences = X_dev
    test_sentences = X_test
    train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros)
    dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros)
    test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros)
    train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros)
    dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros)
    test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros)
    train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros)
    dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros)
    test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_org, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_org, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
            dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0]
            dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained(
                dico_chars_train_loc.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_loc])))
            dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0]
            dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained(
                dico_chars_train_per.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_per])))
            dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0]
            dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained(
                dico_chars_train_org.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_org])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
            _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower)
            _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower)
            _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc)
        _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per)
        _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
                         char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
                         char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
             char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
             char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org) = pickle.load(f)

    # prepare data, get a collection of lists containing indexes
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))
    train_data_loc = prepare_dataset_ner(train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower)
    dev_data_loc = prepare_dataset_ner(dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower)
    test_data_loc = prepare_dataset_ner(test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower)
    print("%i / %i / %i sentences_loc in train / dev / test." % (
        len(train_data_loc), len(dev_data_loc), len(test_data_loc)))
    train_data_per = prepare_dataset_ner(train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower)
    dev_data_per = prepare_dataset_ner(dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower)
    test_data_per = prepare_dataset_ner(test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower)
    print("%i / %i / %i sentences_per in train / dev / test." % (
        len(train_data_per), len(dev_data_per), len(test_data_per)))
    train_data_org = prepare_dataset_ner(train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower)
    dev_data_org = prepare_dataset_ner(dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower)
    test_data_org = prepare_dataset_ner(test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower)
    print("%i / %i / %i sentences_org in train / dev / test." % (
        len(train_data_org), len(dev_data_org), len(test_data_org)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size)
    train_manager_per = BatchManager(train_data_per, FLAGS.batch_size)
    train_manager_org = BatchManager(train_data_org, FLAGS.batch_size)

    # make paths for storing logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id,
                              char_to_id_loc, tag_to_id_loc,
                              char_to_id_per, tag_to_id_per,
                              char_to_id_org, tag_to_id_org)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    steps_per_epoch_loc = train_manager_loc.len_data
    steps_per_epoch_per = train_manager_per.len_data
    steps_per_epoch_org = train_manager_org.len_data
    model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config,
                         id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org,
                         logger)
    with tf.Session(config=tf_config, graph=model.graph) as sess:
        sess.run(tf.global_variables_initializer())
        if config["pre_emb"]:
            emb_weights = sess.run(model.char_lookup.read_value())
            emb_weights_ner = sess.run(model.char_lookup.read_value())
            emb_weights, emb_weights_ner = load_word2vec(
                config["emb_file"], id_to_char, id_to_char_loc, id_to_char_per,
                id_to_char_org, config["char_dim"], emb_weights, emb_weights_ner)
            sess.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")
        logger.info("start training")
        loss = []
        loss_loc = []
        loss_per = []
        loss_org = []
        for i in range(100):
            # auxiliary LOC task, then the main SKILL task
            for batch_loc in train_manager_loc.iter_batch(shuffle=True):
                step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc)
                loss_loc.append(batch_loss_loc)
                if step_loc % FLAGS.steps_check == 0:
                    iteration_loc = step_loc // steps_per_epoch_loc + 1
                    logger.info("iteration:{} step_loc:{}/{}, NER loss:{:>9.6f}".format(
                        iteration_loc, step_loc % steps_per_epoch_loc,
                        steps_per_epoch_loc, np.mean(loss_loc)))
                    loss_loc = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_1 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, SKILL loss:{:>9.6f}".format(
                        iteration_1, step % steps_per_epoch, steps_per_epoch,
                        np.mean(loss)))
                    loss = []
            precision_loc_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_loc_test = model.precision(sess, test_manager, id_to_tag)
            # auxiliary PER task, then the main SKILL task
            for batch_per in train_manager_per.iter_batch(shuffle=True):
                step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per)
                loss_per.append(batch_loss_per)
                if step_per % FLAGS.steps_check == 0:
                    iteration_per = step_per // steps_per_epoch_per + 1
                    logger.info("iteration:{} step_per:{}/{}, NER loss:{:>9.6f}".format(
                        iteration_per, step_per % steps_per_epoch_per,
                        steps_per_epoch_per, np.mean(loss_per)))
                    loss_per = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_2 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, SKILL loss:{:>9.6f}".format(
                        iteration_2, step % steps_per_epoch, steps_per_epoch,
                        np.mean(loss)))
                    loss = []
            precision_per_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_per_test = model.precision(sess, test_manager, id_to_tag)
            # auxiliary ORG task, then the main SKILL task
            for batch_org in train_manager_org.iter_batch(shuffle=True):
                step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org)
                loss_org.append(batch_loss_org)
                if step_org % FLAGS.steps_check == 0:
                    iteration_org = step_org // steps_per_epoch_org + 1
                    logger.info("iteration:{} step_org:{}/{}, NER loss:{:>9.6f}".format(
                        iteration_org, step_org % steps_per_epoch_org,
                        steps_per_epoch_org, np.mean(loss_org)))
                    loss_org = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_3 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, SKILL loss:{:>9.6f}".format(
                        iteration_3, step % steps_per_epoch, steps_per_epoch,
                        np.mean(loss)))
                    loss = []
            precision_org_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_org_test = model.precision(sess, test_manager, id_to_tag)
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                            precision_loc_dev, precision_per_dev, precision_org_dev,
                            logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            best_test, results = evaluate(sess, model, "test", test_manager, id_to_tag,
                                          precision_loc_test, precision_per_test,
                                          precision_org_test, logger)
            with open("CDTL_PSE-result.csv", "a", encoding='utf-8') as st_re:
                st_re.write(str(results).replace("[", "").replace("]", ""))
                st_re.write("\n")
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.zeros)

    # appoint tagging scheme (IOB/IOBES)
    train_sentences = update_tag_scheme(train_sentences, FLAGS.tag_schema)
    dev_sentences = update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    test_sentences = update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.exists(FLAGS.map_file):
        if FLAGS.pre_emb:
            char_to_id, _ = char_mapping(train_sentences)
            char_to_id, id_to_char = augment_with_pretrained(char_to_id, 'wiki_100.utf8')
        else:
            char_to_id, id_to_char = char_mapping(train_sentences)
        tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f,
                        pickle.HIGHEST_PROTOCOL)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indexes
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, True)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, True)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, True)
    print("%i %i %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    if not FLAGS.pre_emb:
        pre_emb = None
    else:
        pre_emb = load_word2vec(FLAGS.pre_emb_file, char_to_id, FLAGS.char_dim)
        print("init embedding shape: (%d,%d)" % (pre_emb.shape[0], pre_emb.shape[1]))

    train_manager = BatchManager(train_data, FLAGS.batch_size, True)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size, False)
    test_manager = BatchManager(test_data, FLAGS.batch_size, False)

    config = BasicModelConfig(FLAGS, len(char_to_id), len(tag_to_id), 4)
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = FLAGS.memory_usage
    with tf.Session(config=tfConfig) as sess:
        print("Train started!")
        model = BasicModel(config, pre_emb)
        saver = tf.train.Saver()

        # tensorboard
        if not os.path.exists(FLAGS.summaries_dir):
            os.mkdir(FLAGS.summaries_dir)
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.model_name, "train"), sess.graph)
        test_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.model_name, "test"), sess.graph)

        # load a previously trained model or create a new one
        if not os.path.exists(FLAGS.checkpoints):
            os.mkdir(FLAGS.checkpoints)
        model_name = os.path.join(FLAGS.checkpoints, FLAGS.model_name)
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoints)
        if ckpt and ckpt.model_checkpoint_path:
            print("restore from previously trained model: %s" % FLAGS.model_name)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        def evaluate(sess, model, manager):
            strings = []
            predicts = []
            goldens = []
            bar = ProgressBar(max_value=manager.num_batch)
            for batch in bar(manager.iter_batch()):
                batch_string, batch_predict, batch_golden = model.evaluate_step(sess, batch)
                strings.extend(batch_string)
                predicts.extend(batch_predict)
                goldens.extend(batch_golden)
            return strings, predicts, goldens

        best_eval_f1 = 0
        noimpro_num = 0
        for i in range(FLAGS.max_epoch):
            # train
            train_loss = []
            bar = ProgressBar(max_value=train_manager.num_batch)
            for step, batch in bar(enumerate(train_manager.iter_batch())):
                batch.append(merged)
                summary, global_step, batch_loss = model.train_step(sess, batch,
                                                                    FLAGS.dropout_keep)
                # add summary to tensorboard
                train_writer.add_summary(summary, global_step)
                train_loss.append(batch_loss)
            print("Epoch %d Train loss is %.4f" % (i + 1, np.mean(train_loss)))

            # dev
            strings, predicts, goldens = evaluate(sess, model, dev_manager)
            eval_f1 = report_results(strings, predicts, goldens, id_to_char, id_to_tag,
                                     'outputs/dev')
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                noimpro_num = 0
                saver.save(sess, model_name)
            else:
                noimpro_num += 1
            print("Epoch %d Best eval f1: %.6f" % (i + 1, best_eval_f1))

            # test
            strings, predicts, goldens = evaluate(sess, model, test_manager)
            test_f1 = report_results(strings, predicts, goldens, id_to_char, id_to_tag,
                                     'outputs/test', True)
            # early stop
            if noimpro_num >= 3:
                print("Early stop! Final F1 score on test data is: %.6f" % test_f1)
                break
            print()
def __init__(self):
    # load word embeddings
    glove = data_utils.load_glove(FLAGS.glove_file)
    word2vec = data_utils.load_word2vec(FLAGS.word2vec_file)
    merged_embed, self.vocab_size = self.merge_glove_word2vec(glove, word2vec)
    dim = len(merged_embed[0])
    merged_embed.append([0. for _ in range(dim)])

    # load doc embedding
    self.doc_embedding, doc_dim = data_utils.load_fastText_embed(
        FLAGS.fastText_doc_file, FLAGS.fastText_vector_file)
    self.zero_doc_key = self.doc_key([self.vocab_size], [self.vocab_size])
    self.doc_embedding[self.zero_doc_key] = [0. for _ in range(doc_dim)]

    FLAGS.fc_units = list(map(int, FLAGS.fc_units.split(',')))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)

    # graph
    print('Initializing model graph...')
    with tf.variable_scope('inputs'):
        self.training = tf.placeholder(tf.bool, name='training')
        # [batch size, sequence length]
        self.title = tf.placeholder(tf.int32, shape=[None, None], name='title')
        self.content = tf.placeholder(tf.int32, shape=[None, None], name='content')
        self.title_length = tf.placeholder(tf.int32, shape=[None], name='title_length')
        self.content_length = tf.placeholder(tf.int32, shape=[None],
                                             name='content_length')
        self.prices = tf.placeholder(tf.float32, name='prices', shape=[None, None, 7])
        self.price_length = tf.placeholder(tf.int32, shape=[None], name='price_length')
        self.docs = tf.placeholder(tf.float32, name='docs', shape=[None, None, doc_dim])
        self.doc_length = tf.placeholder(tf.int32, shape=[None], name='doc_length')
        self.label = tf.placeholder(tf.int32, shape=[None, 2], name='label')

    with tf.variable_scope('birnn_embed'):
        self.word_embedding = tf.Variable(merged_embed, dtype=tf.float32,
                                          name='word_embedding_matrix')
        title_embed = self.embed_birnn(FLAGS.title_units, FLAGS.title_layers,
                                       self.title, self.title_length,
                                       scope='title_embed_birnn')
        content_embed = self.embed_birnn(FLAGS.content_units, FLAGS.content_layers,
                                         self.content, self.content_length,
                                         scope='content_embed_birnn')
        price_embed = self.birnn(FLAGS.price_units, FLAGS.price_layers,
                                 self.prices, self.price_length, scope='price_birnn')
        doc_embed = self.birnn(FLAGS.doc_units, FLAGS.doc_layers,
                               self.docs, self.doc_length, scope='doc_birnn')
        final_embed = tf.concat([title_embed, content_embed, doc_embed, price_embed], 1)

    with tf.variable_scope('full_connect'):
        fc_inputs = final_embed
        for i in range(FLAGS.fc_layers):
            with tf.variable_scope('full_connect_layer_%d' % i):
                fc_outputs = tf.contrib.layers.legacy_fully_connected(
                    fc_inputs, FLAGS.fc_units[i],
                    activation_fn=tf.nn.relu,
                    weight_regularizer=tf.contrib.layers.l2_regularizer(FLAGS.l2_coef))
                fc_inputs = fc_outputs

    with tf.variable_scope('dropout'):
        dropout = tf.layers.dropout(fc_outputs, training=self.training)

    with tf.variable_scope('output'):
        W = tf.get_variable('W', shape=[FLAGS.fc_units[-1], 2],
                            initializer=tf.truncated_normal_initializer())
        biases = tf.get_variable('biases', shape=[2],
                                 initializer=tf.random_normal_initializer())
        logits = tf.matmul(dropout, W) + biases
        self.result = tf.nn.softmax(logits)

    with tf.variable_scope('train'):
        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.label, logits=logits))
        self.learning_rate = tf.Variable(FLAGS.init_lr, trainable=False,
                                         name="learning_rate")
        self.lr_decay_op = self.learning_rate.assign(self.learning_rate * FLAGS.lr_decay)
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.train_op = tf.train.AdamOptimizer(FLAGS.init_lr).minimize(
            self.cross_entropy, self.global_step)

    with tf.variable_scope('logs'):
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        self.log_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.train_dir, 'logs/'), self.session.graph)
        self.summary = tf.Summary()
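# Hypothetical training step for the graph built in __init__ above: every
# placeholder from the 'inputs' scope is fed, then train_op and cross_entropy
# are run together. The *_batch / *_lens arguments are assumptions standing in
# for the caller's batching code.
def train_step_sketch(model, title_batch, title_lens, content_batch, content_lens,
                      price_batch, price_lens, doc_batch, doc_lens, label_batch):
    feed = {
        model.training: True,
        model.title: title_batch, model.title_length: title_lens,
        model.content: content_batch, model.content_length: content_lens,
        model.prices: price_batch, model.price_length: price_lens,
        model.docs: doc_batch, model.doc_length: doc_lens,
        model.label: label_batch,
    }
    _, loss = model.session.run([model.train_op, model.cross_entropy], feed)
    return loss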