def run_infer(config, loaded_infer_model, infer_sess, pred_file):
    logger.info(" inference to output %s." % pred_file)
    infer_data = data_helper.load_data(config.infer_file, config.word_vocab_file, config.char_vocab_file,
                                       w_max_len1=config.max_word_len1, w_max_len2=config.max_word_len2,
                                       c_max_len1=config.max_char_len1, c_max_len2=config.max_char_len2,
                                       text_split="|", split="\t", mode="infer")
    infer_iterator = data_helper.batch_iterator(infer_data, batch_size=config.infer_batch_size,
                                                shuffle=False, mode="infer")
    pred_labels = []
    lines = open(config.infer_file, "r", encoding="utf-8").readlines()
    with open(pred_file, mode="w", encoding="utf-8") as pred_f:
        pred_f.write("")
        while True:
            try:
                batch = next(infer_iterator)
                (b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
                 b_char_ids1, b_char_ids2, b_char_len1, b_char_len2) = batch
                pred = loaded_infer_model.infer(infer_sess,
                                                b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
                                                b_char_ids1, b_char_ids2, b_char_len1, b_char_len2)
                pred_labels.extend(pred)
            except StopIteration:
                logger.info(" Done inference.")
                break
        for line, p in zip(lines, pred_labels):
            res = line.strip() + "\t" + str(p) + "\n"
            pred_f.write(res)
def run_test(config, infer_model, infer_sess, data_file, model_dir):
    output_file = "output_" + os.path.split(data_file)[-1].split(".")[0]
    pred_file = os.path.join(model_dir, output_file)
    logger.info(" predictions to output %s." % pred_file)
    with infer_model.graph.as_default():
        loaded_infer_model, global_step = model_helper.create_or_load_model(
            infer_model.model, model_dir, infer_sess, "infer")
    # running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
    # running_vars_initializer = tf.variables_initializer(var_list=running_vars)
    # TODO: tf.metrics
    # infer_sess.run(running_vars_initializer)
    infer_sess.run(tf.local_variables_initializer())
    infer_data = data_helper.load_data(data_file, config.word_vocab_file, config.char_vocab_file,
                                       w_max_len1=config.max_word_len1, w_max_len2=config.max_word_len2,
                                       c_max_len1=config.max_char_len1, c_max_len2=config.max_char_len2,
                                       text_split="|", split="\t", mode="infer")
    infer_iterator = data_helper.batch_iterator(infer_data, batch_size=config.batch_size,
                                                shuffle=False, mode="infer")
    start_time = time.time()
    step = 0
    pred_labels = []
    lines = open(data_file, "r", encoding="utf-8").readlines()
    with open(pred_file, mode="w", encoding="utf-8") as pred_f:
        pred_f.write("")
        while True:
            try:
                (b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
                 b_char_ids1, b_char_ids2, b_char_len1, b_char_len2) = next(infer_iterator)
                pred = loaded_infer_model.infer(infer_sess,
                                                b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
                                                b_char_ids1, b_char_ids2, b_char_len1, b_char_len2)
                pred_labels.extend(pred)
                step += 1
            except StopIteration:
                break
        end_time = time.time()
        for line, p in zip(lines, pred_labels):
            res = line.strip() + "\t" + str(p) + "\n"
            pred_f.write(res)
    step_time = (end_time - start_time) / step
    logger.info("# predict step time %.4fs" % step_time)
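
# A minimal, hypothetical driver for run_infer() above — a sketch only. It reuses helpers
# that appear elsewhere in this codebase (model_helper.create_model,
# model_helper.create_or_load_model, utils.get_config_proto); the function name, the
# model_creator argument and the output path are illustrative assumptions, not the repo's
# actual entry point.
def run_infer_main(config, model_creator):
    infer_model = model_helper.create_model(model_creator, config, "infer")
    infer_sess = tf.Session(config=utils.get_config_proto(), graph=infer_model.graph)
    with infer_model.graph.as_default():
        loaded_infer_model, _ = model_helper.create_or_load_model(
            infer_model.model, config.model_dir, infer_sess, "infer")
    pred_file = os.path.join(config.model_dir, "output_infer.txt")  # illustrative path
    run_infer(config, loaded_infer_model, infer_sess, pred_file)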
def test():
    print("process the images into h5 files.....")
    test_dir = flags.test_dir
    test_h5_dir = flags.test_h5_dir
    stride = flags.test_stride
    if not os.path.exists(test_h5_dir):
        os.makedirs(test_h5_dir)
    test_set5 = os.path.join(test_dir, 'Set5')
    test_set14 = os.path.join(test_dir, 'Set14')
    path_set5 = os.path.join(test_h5_dir, 'Set5')
    path_set14 = os.path.join(test_h5_dir, 'Set14')
    data_helper.gen_input_image(test_set5, path_set5, stride)
    data_helper.gen_input_image(test_set14, path_set14, stride)

    print("initialize the model......")
    model_dir = flags.model_dir
    model = SRCNN(flags)
    model.build_graph()
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(model.sess, ckpt.model_checkpoint_path)
    else:
        print("model checkpoint doesn't exist!")
        raise ValueError("no checkpoint found in %s" % model_dir)

    print("test on Set5......")
    test_h5_path = os.path.join(path_set5, "data.h5")
    data_set5, label_set5 = data_helper.load_data(test_h5_path)
    accu = model.test(data_set5, label_set5)
    print("the accuracy on Set5 is %.5f" % accu)

    print("test on Set14......")
    test_h5_path = os.path.join(path_set14, "data.h5")
    data_set14, label_set14 = data_helper.load_data(test_h5_path)
    accu2 = model.test(data_set14, label_set14)
    print("the accuracy on Set14 is %.5f" % accu2)
def train():
    print("process the images into h5 files.....")
    data_dir = flags.data_dir
    h5_dir = flags.h5_dir
    stride = flags.train_stride
    data_helper.gen_input_image(data_dir, h5_dir, stride)

    print("reading data......")
    h5_path = os.path.join(h5_dir, "data.h5")
    data, label = data_helper.load_data(h5_path)

    print("initialize the model......")
    model = SRCNN(flags)
    model.build_graph()
    model.train(data, label)
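
# Hypothetical entry point for the two SRCNN routines above (not part of the original
# file): it assumes a string flag named `mode` and the usual tf.app.run() wiring, both
# of which are illustrative only.
def main(_):
    if flags.mode == "train":
        train()
    elif flags.mode == "test":
        test()
    else:
        raise ValueError("unknown mode: %s" % flags.mode)


if __name__ == "__main__":
    tf.app.run()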
def mode_evaluate(config, input_path):
    """
    Run the eval mode: evaluate a trained model.
    :param config: configuration object
    :param input_path: dataset directory
    :return: None
    """
    # Load the test set
    x_test, y_test = load_data(
        os.path.join(input_path, "data_test.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("Test set file loaded successfully")

    # Load the existing vocabulary
    my_vocab = load_vocabulary(max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("Loaded existing vocabulary, actual size: {}, configured size: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size
    ))

    # Preprocess the data (convert words to ids and pad)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Test Set size: %d" % len(x_test))

    config.keep_prob = 1.0

    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create the batcher for the test set
    test_batcher = Batcher(x_test, y_test, batch_size=config.batch_size)

    # Evaluate the model
    evaluate(classifier, config, test_batcher)
tf.app.flags.DEFINE_string('data_path', '../text_data/input_data/', 'input data path')

# Model params
tf.app.flags.DEFINE_string("filter_sizes", "2,3,4", "textcnn model, convolution filter sizes")
tf.app.flags.DEFINE_integer("num_filters", 2, "textcnn model, number of convolution filters")
tf.app.flags.DEFINE_integer("num_classes", 2, "number of classes")
tf.app.flags.DEFINE_float("keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.app.flags.DEFINE_integer("hidden_num", 2, "number of RNN cells")
tf.app.flags.DEFINE_integer("hidden_size", 2, "number of RNN layers")

# Training params
tf.app.flags.DEFINE_float("learning_rate", 0.01, "learning rate (default: 0.01)")
tf.app.flags.DEFINE_integer("epochs", 10, "Number of training epochs (default: 10)")
tf.app.flags.DEFINE_integer("batch_size", 512, "Batch size (default: 512)")
tf.app.flags.DEFINE_integer("checkpoint_every", 100, "Save model every N steps (default: 100)")
tf.app.flags.DEFINE_string("checkpoint_dir", './model_save/', "checkpoint directory")

train_x, train_y, valid_x, valid_y, embedding, word2index, index2word, vocab_size, maxlen = \
    data_helper.load_data('../text_data/input_data/')
print(train_x.shape)
print(vocab_size)
print(embedding.shape)
print(embedding.dtype)
print(maxlen)

# model = FastText(
#     num_classes=FLAGS.num_classes,
#     sequence_length=maxlen,
#     w2v_model_embedding=embedding,
#     vocab_size=vocab_size,
#     embedding_size=200)
# model = TextCNN(filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
    # './data/vaccine/vaccine_month_sample.tsv',
    # './data/vaccine/vaccine_year_sample.tsv',
    # './data/parties/parties_year_sample.tsv',
    # './data/aware/aware_month_sample.tsv',
    # './data/economy/economy_rel_month_sample.tsv',
    # './data/economy/economy_rel_year_sample.tsv',
    './data/amazon/amazon_review_month_sample.tsv',
    './data/amazon/amazon_review_year_sample.tsv',
    './data/yelp/yelp_Hotels_month_sample.tsv',
    './data/yelp/yelp_Hotels_year_sample.tsv',
    './data/yelp/yelp_Restaurants_month_sample.tsv',
    './data/yelp/yelp_Restaurants_year_sample.tsv',
]

for data_path in file_list:
    dataset = data_helper.load_data(data_path)
    paths = data_path.split('/')
    paths[1] = 'features'
    paths[-1] = paths[-1][:-11]  # drop the 11-character '_sample.tsv' suffix
    outp = '/'.join(paths)
    for ftype in ['tfidf', 'binary']:
        tmp_path = outp + '_' + ftype + '.pkl'
        tmp_path = data_helper.train_fvs_da(dataset, outputfile=outp, balance=False, fea_type=ftype)
        # fvs_file = pickle.load(open(tmp_path, 'rb'))
        print(tmp_path)
    # for balance in [True, False]:
    #     print('\t----------------Balance: ' + str(balance) + '---------------')
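
# Quick, illustrative check of the path rewrite used in the loop above (the example
# value is one of the entries in file_list; the rewrite itself is taken from that loop).
_example = './data/amazon/amazon_review_month_sample.tsv'
_parts = _example.split('/')
_parts[1] = 'features'
_parts[-1] = _parts[-1][:-11]  # strip '_sample.tsv'
assert '/'.join(_parts) == './features/amazon/amazon_review_month'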
def train(config, model_creator):
    steps_per_stats = config.steps_per_stats
    steps_per_eval = config.steps_per_eval
    model_dir = config.model_dir
    log_dir = config.log_dir
    ckpt_name = config.ckpt_name
    ckpt_path = os.path.join(model_dir, ckpt_name)

    # Create model
    train_model = model_helper.create_model(model_creator, config, "train")
    eval_model = model_helper.create_model(model_creator, config, "eval")
    # infer_model = model_helper.create_model(model_creator, config, "infer")

    train_data = data_helper.load_data(config.train_file, config.word_vocab_file, config.char_vocab_file,
                                       w_max_len1=config.max_word_len1, w_max_len2=config.max_word_len2,
                                       c_max_len1=config.max_char_len1, c_max_len2=config.max_char_len2,
                                       text_split="|", split="\t")
    train_iterator = data_helper.batch_iterator(train_data, batch_size=config.batch_size, shuffle=True)
    eval_data = data_helper.load_data(config.dev_file, config.word_vocab_file, config.char_vocab_file,
                                      w_max_len1=config.max_word_len1, w_max_len2=config.max_word_len2,
                                      c_max_len1=config.max_char_len1, c_max_len2=config.max_char_len2,
                                      text_split="|", split="\t")
    # eval_iterator = data_helper.batch_iterator(eval_data, batch_size=config.batch_size, shuffle=False)

    # TensorFlow model
    session_config = utils.get_config_proto()
    train_sess = tf.Session(config=session_config, graph=train_model.graph)
    eval_sess = tf.Session(config=session_config, graph=eval_model.graph)
    # infer_sess = tf.Session(config=config, graph=infer_model.graph)

    # Summary Writer
    train_summary_writer = tf.summary.FileWriter(
        os.path.join(log_dir, "train_log"), train_model.graph)
    eval_summary_writer = tf.summary.FileWriter(
        os.path.join(log_dir, "eval_log"), eval_model.graph)

    with train_model.graph.as_default():
        loaded_train_model, global_step = model_helper.create_or_load_model(
            train_model.model, model_dir, train_sess, "train")
        local_initializer = tf.local_variables_initializer()
        # running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        # running_vars_initializer = tf.variables_initializer(var_list=running_vars)

    step_time, train_loss, train_acc, train_rec, train_pre, train_f1, train_auc, gN = \
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    lr = loaded_train_model.learning_rate.eval(session=train_sess)
    last_stat_step = global_step
    last_eval_step = global_step

    logger.info("# Start step %d" % global_step)
    epoch_idx = 0
    while epoch_idx < config.num_train_epochs:
        start_time = time.time()
        try:
            # TODO: tf.metrics
            # train_sess.run(running_vars_initializer)
            train_sess.run(local_initializer)
            batch = next(train_iterator)
            (b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
             b_char_ids1, b_char_ids2, b_char_len1, b_char_len2, b_labels) = batch
            # for b in batch:
            #     print(b)
            train_summary1, pred, step_loss, _, acc_op, rec_op, pre_op, auc_op, global_step, grad_norm, lr = \
                loaded_train_model.train(train_sess,
                                         b_word_ids1, b_word_ids2, b_word_len1, b_word_len2,
                                         b_char_ids1, b_char_ids2, b_char_len1, b_char_len2, b_labels)
            train_summary2, step_acc, step_rec, step_pre, step_auc = \
                train_sess.run([loaded_train_model.train_summary2, loaded_train_model.accuracy,
                                loaded_train_model.recall, loaded_train_model.precision,
                                loaded_train_model.auc])
            config.epoch_step += 1
        except StopIteration:
            # Finished going through the training dataset. Go to next epoch.
            epoch_idx += 1
            config.epoch_step = 0
            train_iterator = data_helper.batch_iterator(
                train_data, batch_size=config.batch_size, shuffle=True)
            continue

        step_time += (time.time() - start_time)
        train_loss += step_loss
        train_acc += step_acc
        train_rec += step_rec
        train_pre += step_pre
        train_auc += step_auc
        gN += grad_norm

        if global_step - last_stat_step >= steps_per_stats:
            last_stat_step = global_step
            step_time /= steps_per_stats
            train_loss /= steps_per_stats
            train_acc /= steps_per_stats
            train_rec /= steps_per_stats
            train_pre /= steps_per_stats
            train_f1 = (2 * train_rec * train_pre) / (train_rec + train_pre + 0.00000001)
            gN /= steps_per_stats
            logger.info(
                " step %d lr %g step_time %.2fs loss %.4f acc %.4f rec %.4f pre %.4f f1 %.4f auc %.4f gN %.2f" %
                (global_step, lr, step_time, train_loss, train_acc, train_rec, train_pre,
                 train_f1, train_auc, gN))  # report the averaged gradient norm, not just the last step's
            train_summary_writer.add_summary(train_summary1, global_step=global_step)
            train_summary_writer.add_summary(train_summary2, global_step=global_step)
            step_time, train_loss, train_acc, train_rec, train_pre, train_f1, train_auc, gN = \
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

        if global_step - last_eval_step >= steps_per_eval:
            last_eval_step = global_step
            # Save checkpoint
            loaded_train_model.saver.save(train_sess, ckpt_path, global_step=global_step)
            # Evaluate on dev
            run_eval(config, eval_model, eval_sess, eval_data, model_dir, ckpt_name,
                     eval_summary_writer, save_on_best=True)
            logger.info("# Finished epoch %d, step %d." % (epoch_idx, global_step))

    # Done training
    loaded_train_model.saver.save(train_sess, ckpt_path, global_step=global_step)
    logger.info(
        "# Final, step %d lr %g step_time %.2fs loss %.4f acc %.4f rec %.4f pre %.4f f1 %.4f auc %.4f gN %.2f" %
        (global_step, lr, step_time, train_loss, train_acc, train_rec, train_pre,
         train_f1, train_auc, gN))
    logger.info("# Done training!")
    train_summary_writer.close()
    eval_summary_writer.close()
def train_step(config_disc, config_evl):
    print("loading the disc train set")
    config = config_disc
    eval_config = config_evl
    eval_config.keep_prob = 1.0

    train_data, valid_data, test_data = data_helper.load_data(
        True, config.max_len, batch_size=config.batch_size)

    print("begin training")
    # gpu_config = tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session() as session:
        print("model training")
        initializer = tf.random_uniform_initializer(-1 * config.init_scale, 1 * config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            # model = disc_rnn_model.disc_rnn_model(config=config, is_training=True)
            model = create_model(session, config, is_training=True)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            # valid_model = disc_rnn_model.disc_rnn_model(config=eval_config, is_training=False)
            # test_model = disc_rnn_model.disc_rnn_model(config=eval_config, is_training=False)
            valid_model = create_model(session, eval_config, is_training=False)
            test_model = create_model(session, eval_config, is_training=False)

        # add summaries
        # train_summary_op = tf.merge_summary([model.loss_summary, model.accuracy])
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
        # dev_summary_op = tf.merge_summary([valid_model.loss_summary, valid_model.accuracy])
        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)

        # add checkpoint
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "disc.model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        # saver = tf.train.Saver(tf.all_variables())

        tf.global_variables_initializer().run()
        global_steps = 1
        begin_time = int(time.time())

        for i in range(config.num_epoch):
            print("epoch %d training..." % (i + 1))
            lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            model.assign_new_lr(session, config.lr * lr_decay)
            global_steps = run_epoch(model, session, train_data, global_steps, valid_model, valid_data,
                                     config_disc.batch_size, train_summary_writer, dev_summary_writer)
            if i % config.checkpoint_every == 0:
                path = model.saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("training is finished")
        end_time = int(time.time())
        print("training took %d seconds\n" % (end_time - begin_time))

        test_accuracy = evaluate(test_model, session, test_data, config_disc.batch_size)
        print("the test data accuracy is %f" % test_accuracy)
        print("program end!")
def mode_train(config, input_path):
    """
    Run the train mode: train a model with the given configuration.
    :param config: configuration object
    :param input_path: dataset directory
    :return: None
    """
    # Load the training and test sets
    x_train, y_train = load_data(
        os.path.join(input_path, "data_train.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("Training set file loaded successfully")
    x_test, y_test = load_data(
        os.path.join(input_path, "data_test.txt"),
        sample_ratio=config.data_sample_ratio,
        n_class=config.n_class,
        one_hot=config.one_hot,
    )
    print("Test set file loaded successfully")

    # Get the validation set
    if os.path.isfile(os.path.join(input_path, "data_valid.txt")):
        # Load it from the validation set file
        x_valid, y_valid = load_data(
            os.path.join(input_path, "data_valid.txt"),
            sample_ratio=config.data_sample_ratio,
            n_class=config.n_class,
            one_hot=config.one_hot,
        )
        print("Validation set file loaded successfully")
    else:
        # Split part of the test set off as the validation set
        split_ratio = config.valid_test_split_radio  # split proportion
        x_test, x_valid, y_test, y_valid = split_dataset(x_test, y_test, split_ratio)
        print("No validation set file found; split {}% of the test set as the validation set".format(
            split_ratio * 100))

    # Build the vocabulary from the training data
    my_vocab = make_vocabulary(x_train, max_vocab_size=config.max_vocab_size)
    config.vocab_size = my_vocab.vocab_size
    print("Vocabulary built from the training set, actual size: {}, configured size: {}".format(
        len(my_vocab.word_index) + 1, config.vocab_size))

    # Preprocess the data (convert words to ids and pad)
    print('Preprocessing datasets (word tokens -> ids)')
    x_train = data_preprocessing(x_train, my_vocab, max_len=config.max_len)
    x_valid = data_preprocessing(x_valid, my_vocab, max_len=config.max_len)
    x_test = data_preprocessing(x_test, my_vocab, max_len=config.max_len)
    print("Train Set size: %d" % len(x_train))
    print("Valid Set size: %d" % len(x_valid))
    print("Test Set size: %d" % len(x_test))

    # Build the classifier
    classifier = choose_model_by_name(config)
    classifier.build_graph()

    # Create batchers for the training, validation and test sets
    train_batcher = Batcher(x_train, y_train, batch_size=config.batch_size)
    valid_batcher = Batcher(x_valid, y_valid, batch_size=config.batch_size)
    test_batcher = Batcher(x_test, y_test, batch_size=config.batch_size)

    # Train the model
    train(classifier, config, train_batcher, valid_batcher, test_batcher)
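
# split_dataset is used above but not defined in this file; a minimal sketch of what it
# might look like, assuming the third argument is the fraction carved out for validation
# (an illustrative assumption, not the repo's actual implementation).
from sklearn.model_selection import train_test_split

def split_dataset(x, y, split_ratio):
    # Returns (x_test, x_valid, y_test, y_valid) in the order unpacked in mode_train().
    x_test, x_valid, y_test, y_valid = train_test_split(x, y, test_size=split_ratio)
    return x_test, x_valid, y_test, y_valid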
import numpy as np
import matplotlib.pyplot as plt

from utils.data_helper import QuerySimilarityProcessor, load_data
# import pandas as pd
# pd.options.display.max_columns = None

processor = QuerySimilarityProcessor()
label_list = processor.get_labels()
num_labels = len(label_list)

config = BertConfig.from_pretrained("../data/bert-base-chinese/config.json", num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained("../data/bert-base-chinese/")
tokenizer = BertTokenizer.from_pretrained('../data/bert-base-chinese/vocab.txt')

train_dataset = load_data("../data/ATEC/", processor, tokenizer)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)

valid_dataset = load_data("../data/ATEC/", processor, tokenizer, evaluate=True)
valid_sampler = RandomSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=32)

config_class, model_class, tokenizer_class = [BertConfig, BertForSequenceClassification, BertTokenizer]

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # NOTE: this first group would normally carry a non-zero weight decay;
    # both groups are 0.0 here, as in the original code.
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
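
# A minimal fine-tuning loop for the setup above — a sketch only. The batch layout
# (input_ids, attention_mask, token_type_ids, labels) follows the common TensorDataset
# convention and is an assumption about what load_data returns; the optimizer, learning
# rate and epoch count are illustrative, not taken from this repo.
import torch
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

model.train()
for epoch in range(3):
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=batch[0], attention_mask=batch[1],
                        token_type_ids=batch[2], labels=batch[3])
        loss = outputs[0]  # loss comes first when labels are passed
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()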
"how often to run a validation minibatch.") flags.DEFINE_integer('validate_batch_size', 128, "how many nodes per validation sample.") flags.DEFINE_integer('print_every', 10, "How often to print training info.") flags.DEFINE_integer('max_total_steps', 1000, "Maximum total number of iterations") flags.DEFINE_string('temporal_learner', 'LSTM', 'Which temporal learner to choose.') flags.DEFINE_integer('gpu', 0, "which gpu to use.") os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ["CUDA_VISIBLE_DEVICES"] = str(FLAGS.gpu) # main procedure print("### loading training data...") train_data = load_data(FLAGS.train_prefix) graphs_with_views = train_data[0] #list,(10,4) features = train_data[1] #None id_map = train_data[2] #dict,35981 val_edges = train_data[4] #list,(399,2) context_pairs = train_data[3] if FLAGS.random_context else None # pad with dummy zero vector if not features is None: features = np.vstack([features, np.zeros((features.shape[1], ))]) # print(features) #None print("### Initializing minibatch iterator...") placeholders = construct_placeholders() minibatch = EdgeMinibatchIterator(graphs_with_views, id_map,
     'News data - economy'),
    ('./data/yelp/yelp_Restaurants_month_sample.tsv',
     './data/yelp/yelp_Restaurants_year_sample.tsv',
     'yelp_rest',
     'Reviews data - restaurants'),
    # './data/yelp/yelp_Restaurants_month_sample.tsv'
]

for pair in file_list:
    print(pair)
    for is_binary in [False]:  # True, skip binary currently
        # on month
        month_file = pair[0]
        year_file = pair[1]
        output = pair[2]
        if month_file:
            dataset = data_helper.load_data(month_file)
            # test on balanced data
            print('Test on balanced data')
            test_balance = cross_test_domain_clf(dataset, domain2month, data_name=None,
                                                 balance=True, binary=is_binary)
            test_balance.to_csv('./tmp/' + output + '_month.tsv', sep='\t')
            viz_perform(
                test_balance, pair[3],
                './image/' + output + '/cross_clf_balance_month_' + str(is_binary) + '.pdf')
            test_balance = None
            # print('Test on unbalanced data')
            # test_unbalance = cross_test_domain_clf(dataset, domain2month, data_name=None, balance=False, binary=is_binary)