def do_save_inference_model(args):
    if args.use_cuda:
        dev_count = fluid.core.get_cuda_device_count()
        place = fluid.CUDAPlace(0)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            infer_loader, probs, feed_target_names = create_model(
                args, num_labels=args.num_labels, is_prediction=True)
    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    assert args.init_checkpoint, "init_checkpoint is required to export an inference model"
    utils.init_checkpoint(exe, args.init_checkpoint, test_prog)

    fluid.io.save_inference_model(
        args.inference_model_dir,
        feeded_var_names=feed_target_names,
        target_vars=[probs],
        executor=exe,
        main_program=test_prog,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")
    print("save inference model at %s" % args.inference_model_dir)
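# Hedged usage sketch (not part of the original code): do_save_inference_model
# expects an argparse-style namespace; the field values below are placeholders.
import argparse

args = argparse.Namespace(
    use_cuda=False,
    num_labels=2,
    init_checkpoint="checkpoints/step_1000",  # placeholder checkpoint dir
    inference_model_dir="inference_model")    # placeholder output dir
do_save_inference_model(args)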
def test_inference_model(args):
    if args.use_cuda:
        dev_count = fluid.core.get_cuda_device_count()
        place = fluid.CUDAPlace(0)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    test_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            infer_pyreader, probs, feed_target_names = create_model(
                args,
                pyreader_name='infer_reader',
                num_labels=args.num_labels,
                is_prediction=True)
    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    processor = reader.SentaProcessor(data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      random_seed=args.random_seed,
                                      max_seq_len=args.max_seq_len)
    num_labels = len(processor.get_labels())

    assert args.inference_model_dir, "inference_model_dir is required"
    # Loading the saved model populates the executor's scope with the trained
    # parameters; test_prog then reuses them by variable name.
    infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=args.inference_model_dir,
        executor=exe,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")

    # Integer division keeps the per-device batch size an int.
    infer_data_generator = processor.data_generator(
        batch_size=args.batch_size // dev_count,
        phase="infer",
        epoch=1,
        shuffle=False)

    infer_pyreader.set_sample_list_generator(infer_data_generator)
    inference(exe, test_prog, infer_pyreader, [probs.name], "infer")
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    from tensorflow.python.estimator.model_fn import EstimatorSpec

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
        tf.logging.info("init_checkpoint = %s", init_checkpoint)
        (assignment_map, initialized_variable_names) = \
            modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    # Debug check: run the initializers and spot-check one value from each
    # restored variable to confirm the checkpoint weights were actually loaded.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                if len(var.shape) == 2:
                    print("var.shape=2", var.name, sess.run(var)[0][1])
                else:
                    print("var.shape=1", var.name, sess.run(var)[1])
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

    output_spec = EstimatorSpec(mode=mode, predictions=probabilities)
    return output_spec
def test_inference_model(args, texts):
    if args.use_cuda:
        dev_count = fluid.core.get_cuda_device_count()
        place = fluid.CUDAPlace(0)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            infer_pyreader, probs, feed_target_names = create_model(
                args,
                pyreader_name='infer_reader',
                num_labels=args.num_labels,
                is_prediction=True)
    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    assert args.inference_model_dir, "inference_model_dir is required"
    infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=args.inference_model_dir,
        executor=exe,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")

    # Convert each query to word ids and build a LoD tensor so that
    # variable-length samples can be fed as one batch.
    data = []
    for query in texts:
        wids = utils.query2ids(args.vocab_path, query)
        data.append(wids)
    data_shape = [[len(w) for w in data]]

    pred = exe.run(
        infer_program,
        feed={feed_names[0]: fluid.create_lod_tensor(data, data_shape, place)},
        fetch_list=fetch_targets,
        return_numpy=True)
    for probs in pred[0]:
        print("%d\t%f\t%f\t%f" %
              (np.argmax(probs), probs[0], probs[1], probs[2]))
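# Hedged usage sketch: the texts below are placeholder queries. For each query
# the function above prints the argmax label and the three class probabilities.
sample_texts = ["this is a sample query", "another sample query"]
test_inference_model(args, sample_texts)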
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    # tf.logging.info("*** Features ***")
    # for name in sorted(features.keys()):
    #     tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    # This model_fn is inference-only: the returned spec always reports
    # PREDICT mode, regardless of the `mode` argument.
    output_spec = tf.estimator.EstimatorSpec(
        mode=tf.estimator.ModeKeys.PREDICT, predictions=probabilities)
    return output_spec
def main(data_name,
         out_file,
         model_path,
         step=1,
         config_file="./bert_config_1.json",
         vocab_file="./vocab/vocab_1kmer.txt"):
    tf.logging.set_verbosity(tf.logging.INFO)
    batch_size = 32
    use_tpu = False
    seq_length = 128
    # vocab_file = "./vocab/vocab_2kmer.txt"
    init_checkpoint = model_path
    bert_config = modeling.BertConfig.from_json_file(config_file)
    learning_rate = 2e-5
    num_train_steps = 100
    num_warmup_steps = 10

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.75

    # Convert the FASTA input into a TFRecord file of k-mer token ids.
    samples_num = fasta2record(data_name, "predict.tf_record", vocab_file,
                               step=step)
    batch_num = math.ceil(samples_num / batch_size)
    input_file = "predict.tf_record"
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    input_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    input_mask = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    segment_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    label_ids = tf.placeholder(dtype=tf.int32, shape=(None,))
    is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    num_labels = 2
    use_one_hot_embeddings = False
    is_training = True  # Note: keeps dropout enabled even though this path only runs inference.
    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps,
                                             num_warmup_steps, use_tpu)

    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }
    drop_remainder = False

    def _decode_record(record, name_to_features):
        example = tf.parse_single_example(record, name_to_features)
        # tf.Example only supports int64; cast to int32 for the model.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
            ))
        return d

    predict_data = input_fn({"batch_size": batch_size})
    iterator = predict_data.make_one_shot_iterator().get_next()

    all_prob = []
    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        for _ in range(batch_num):
            examples = sess.run(iterator)
            prob = sess.run(
                probabilities,
                feed_dict={
                    input_ids: examples["input_ids"],
                    input_mask: examples["input_mask"],
                    segment_ids: examples["segment_ids"],
                    label_ids: examples["label_ids"],
                })
            all_prob.extend(prob[:, 1].tolist())

    # Append the positive-class probability to each sequence line of the FASTA file.
    with open(data_name) as f:
        lines = f.readlines()
    with open(out_file, "w") as f:
        index = 0
        for line in lines:
            if line[0] == ">":
                f.write(line)
            else:
                f.write(line.strip() + " " + str(all_prob[index]) + "\n")
                index += 1
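# Hedged usage sketch: paths and file names below are placeholders. main()
# writes each sequence's positive-class probability next to it in out_file.
if __name__ == "__main__":
    main(data_name="test.fasta",           # placeholder FASTA input
         out_file="predictions.txt",       # placeholder output path
         model_path="./model/model.ckpt")  # placeholder checkpoint prefix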
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    global initialized_variable_names

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    (total_loss, per_example_loss, logits) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if use_tpu:
            # On TPU, checkpoint restoration must happen inside a Scaffold.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)
        output_spec = tpu.TPUEstimatorSpec(mode=mode,
                                           loss=total_loss,
                                           train_op=train_op,
                                           scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        output_spec = tpu.TPUEstimatorSpec(mode=mode,
                                           predictions=logits,
                                           loss=total_loss,
                                           scaffold_fn=scaffold_fn)
    else:
        raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % mode)
    return output_spec
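# Sketch of how a model_fn like the one above is typically wired into a
# TPUEstimator, mirroring the upstream BERT run_classifier.py pattern. The
# run-config values here are illustrative assumptions, not from the original.
run_config = tf.contrib.tpu.RunConfig(
    model_dir="./model_output",  # placeholder
    save_checkpoints_steps=1000)
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,               # falls back to CPU/GPU when no TPU is present
    model_fn=model_fn,
    config=run_config,
    train_batch_size=32,
    predict_batch_size=32)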
def main():
    # The following are the input parameters.
    # When changing the dictionary, please modify the value of vocab_size
    # in the file bert_config.json.
    do_eval = FLAGS.do_eval
    do_save_model = FLAGS.do_save_model
    data_name = FLAGS.data_name

    # Number of samples in each data set.
    train_dict = {
        "AMPScan": 2132,
        "BiLSTM": 4174,
        "iAMP": 1758,
        "MAMPs": 5234,
        "fold": 2928,
        "all_data": 8978,
    }
    test_dict = {
        "AMPScan": 1424,
        "BiLSTM": 1156,
        "iAMP": 1839,
        "MAMPs": 1666,
        "fold": 2119,
        "all_data": 8978,
    }

    tf.logging.set_verbosity(tf.logging.INFO)
    train_example_num = train_dict[data_name]
    test_example_num = test_dict[data_name]
    batch_size = FLAGS.batch_size  # If GPU memory is not enough, consider reducing this.
    train_batch_num = math.ceil(train_example_num / batch_size)
    test_batch_num = math.ceil(test_example_num / batch_size)
    num_train_epochs = FLAGS.num_train_epochs
    warmup_proportion = FLAGS.warmup_proportion
    learning_rate = FLAGS.learning_rate
    use_tpu = FLAGS.using_tpu
    seq_length = FLAGS.seq_length
    data_root = FLAGS.data_root
    vocab_file = FLAGS.vocab_file
    init_checkpoint = FLAGS.init_checkpoint
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Prevent TensorFlow from directly occupying all GPU memory.
    config.gpu_options.per_process_gpu_memory_fraction = 0.75

    # Training set input; this file is generated with ljy_tsv2record.
    input_file = data_root + "/train.tf_record"
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    num_train_steps = int(train_example_num / batch_size * num_train_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)

    # Four placeholders for feeding data and labels.
    input_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    input_mask = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    segment_ids = tf.placeholder(dtype=tf.int32, shape=(None, 128))
    label_ids = tf.placeholder(dtype=tf.int32, shape=(None,))
    is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = True
    num_labels = 2
    use_one_hot_embeddings = False
    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps,
                                             num_warmup_steps, use_tpu)

    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }
    drop_remainder = False

    def _decode_record(record, name_to_features):
        example = tf.parse_single_example(record, name_to_features)
        # tf.Example only supports int64; cast to int32 for the model.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
            ))
        return d

    # Build the training set iterator; each sess.run on it yields one batch.
    train_data = input_fn({"batch_size": batch_size})
    iterator = train_data.make_one_shot_iterator().get_next()

    if do_eval:
        input_file = data_root + "/dev.tf_record"
        dev_data = input_fn({"batch_size": batch_size})
        dev_iterator = dev_data.make_one_shot_iterator().get_next()

    val_accs = []
    sps = []
    sns = []
    if do_save_model:
        saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        for step in range(num_train_epochs):
            start_time = time.time()
            for _ in range(train_batch_num):
                examples = sess.run(iterator)  # Run the iterator to fetch a batch.
                _, loss = sess.run(
                    [train_op, total_loss],
                    feed_dict={
                        input_ids: examples["input_ids"],
                        input_mask: examples["input_mask"],
                        segment_ids: examples["segment_ids"],
                        label_ids: examples["label_ids"],
                    })
            print("step:", step, " loss:", round(loss, 4), end=" ")

            all_prob = []
            all_labels = []
            all_pre_labels = []
            if not do_eval:
                end_time = time.time()
                eta_time = (end_time - start_time) * (num_train_epochs - step - 1)
                print(" eta time:", eta_time, "s")
                continue

            for _ in range(test_batch_num):
                examples = sess.run(dev_iterator)
                loss, prob = sess.run(
                    [total_loss, probabilities],
                    feed_dict={
                        input_ids: examples["input_ids"],
                        input_mask: examples["input_mask"],
                        segment_ids: examples["segment_ids"],
                        label_ids: examples["label_ids"],
                    })
                all_prob.extend(prob[:, 1].tolist())
                all_labels.extend(examples["label_ids"].tolist())
                pre_labels = np.argmax(prob, axis=-1).tolist()
                all_pre_labels.extend(pre_labels)

            # Epoch-level metrics on the dev set.
            acc = accuracy_score(all_labels, all_pre_labels)
            val_accs.append(acc)
            auc = roc_auc_score(all_labels, all_prob)
            mcc = matthews_corrcoef(all_labels, all_pre_labels)
            c_mat = confusion_matrix(all_labels, all_pre_labels)
            sn = c_mat[1, 1] / np.sum(c_mat[1, :])  # sensitivity (recall on positives)
            sp = c_mat[0, 0] / np.sum(c_mat[0, :])  # specificity (recall on negatives)
            sps.append(sp)
            sns.append(sn)
            end_time = time.time()
            eta_time = (end_time - start_time) * (num_train_epochs - step - 1)
            print("SE:", sn, " SP:", sp, " ACC:", acc, " MCC:", mcc,
                  " auROC:", auc, " eta time:", eta_time, "s")

        if do_save_model:
            save_path = saver.save(sess, FLAGS.save_path)
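# Hedged sketch of the command-line flags main() reads. The flag names match
# the FLAGS attributes used above; the defaults are illustrative assumptions.
flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_string("data_name", "AMPScan", "Key into train_dict/test_dict.")
flags.DEFINE_string("data_root", "./dataset", "Directory holding the *.tf_record files.")
flags.DEFINE_string("vocab_file", "./vocab/vocab_1kmer.txt", "Vocabulary file.")
flags.DEFINE_string("bert_config", "./bert_config_1.json", "BERT config JSON.")
flags.DEFINE_string("init_checkpoint", None, "Checkpoint to warm-start from.")
flags.DEFINE_string("save_path", "./model_output/model.ckpt", "Where to save the model.")
flags.DEFINE_integer("batch_size", 32, "Batch size.")
flags.DEFINE_integer("num_train_epochs", 50, "Number of training epochs.")
flags.DEFINE_integer("seq_length", 128, "Maximum sequence length.")
flags.DEFINE_float("learning_rate", 2e-5, "Initial learning rate.")
flags.DEFINE_float("warmup_proportion", 0.1, "Proportion of warmup steps.")
flags.DEFINE_bool("using_tpu", False, "Whether to use TPU.")
flags.DEFINE_bool("do_eval", True, "Evaluate on the dev set each epoch.")
flags.DEFINE_bool("do_save_model", True, "Save the model after training.")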
global graph
graph = tf.get_default_graph()
with graph.as_default():
    print("going to restore checkpoint")
    is_training = False
    use_one_hot_embeddings = False
    batch_size = 1
    num_labels = len(label_list)

    input_ids_p = tf.placeholder(tf.int32, [batch_size, None],
                                 name="input_ids")
    input_mask_p = tf.placeholder(tf.int32, [batch_size, None],
                                  name="input_mask")
    label_ids_p = tf.placeholder(tf.int32, [batch_size], name="label_ids")
    # Shaped [batch_size, None] to match input_ids_p; the original snippet's
    # [None] shape would not align with the other inputs inside create_model.
    segment_ids_p = tf.placeholder(tf.int32, [batch_size, None],
                                   name="segment_ids")

    total_loss, per_example_loss, logits, probabilities = create_model(
        bert_config, is_training, input_ids_p, input_mask_p, segment_ids_p,
        label_ids_p, num_labels, use_one_hot_embeddings)

    saver = tf.train.Saver()
    saver.restore(
        sess,
        tf.train.latest_checkpoint(
            os.path.join(input_model_checkpoint, 'model_output')))
    graph_def = tf.get_default_graph().as_graph_def()

    # Export as a SavedModel (variables/ + saved_model.pb).
    # NOTE: the original snippet is truncated mid-call; the remaining inputs
    # and the outputs dict below are assumed from the placeholders above.
    tf.saved_model.simple_save(
        sess,
        export_model_dir,
        inputs={
            "input_ids": input_ids_p,
            "input_mask": input_mask_p,
            "segment_ids": segment_ids_p,
            "label_ids": label_ids_p,
        },
        outputs={"probabilities": probabilities})
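# Hedged sketch: reloading the exported SavedModel for a quick serving check.
# export_model_dir is the directory passed to simple_save above; simple_save
# tags the graph with the standard SERVING tag.
with tf.Session(graph=tf.Graph()) as load_sess:
    tf.saved_model.loader.load(load_sess,
                               [tf.saved_model.tag_constants.SERVING],
                               export_model_dir)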
# sess.run(tf.global_variables_initializer())
# max_seq_length
input_ids_p = tf.placeholder(tf.int32, [1, config.max_seq_length],
                             name="input_ids")
input_mask_p = tf.placeholder(tf.int32, [1, config.max_seq_length],
                              name="input_mask")

bert_config = modeling.BertConfig.from_json_file(
    os.path.join(bert_dir, 'bert_config.json'))

# def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
#                  labels, num_labels, use_one_hot_embeddings):
(total_loss, per_example_loss, logits,
 probabilities) = create_model(bert_config=bert_config,
                               is_training=False,
                               input_ids=input_ids_p,
                               input_mask=input_mask_p,
                               segment_ids=None,
                               labels=None,
                               num_labels=len(labels),
                               use_one_hot_embeddings=False)

saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint(model_dir))

tokenizer = tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_dir, 'vocab.txt'), do_lower_case=True)


@app.route('/class_predict_service', methods=['GET', 'POST'])
def class_predict_service():