def predict_class(text):
    logging.info(f"Input text: {text}")
    logging.info("cleaning input text")
    text = clean_text(text)
    sentence = [text]
    tokenizer = load_tokenizer()
    logging.info("trained tokenizer loaded")
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    text_sequences = tokenizer.texts_to_sequences(sentence)
    text_padded = pad_sequences(text_sequences, padding=PADDING_TYPE,
                                truncating=TRUNC_TYPE, maxlen=MAX_LENGTH)
    logging.info("creating embedding matrix using GloVe embeddings")
    embedding_matrix = embedding_matrix_glove(word_index)
    logging.info(f"Embedding weights created {embedding_matrix.shape}")
    logging.info("getting pre-trained model")
    model = create_model(vocab_size, EMBEDDING_DIM, MAX_LENGTH, embedding_matrix)
    model.summary()  # summary() prints directly; no need to wrap it in print()
    logging.info("loading model weights")
    model.load_weights(MODEL)
    prediction = model.predict(text_padded)
    predicted_index = np.argmax(prediction)
    predicted_main_product = get_main_product(predicted_index)
    logging.info(f"Predicted Main Product: {predicted_main_product}")
    return predicted_main_product

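# A minimal, self-contained sketch (not from this project) of the tokenize-and-pad
# step that predict_class relies on, using the standard Keras preprocessing API;
# the sample texts and maxlen here are made up for illustration.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

toks = Tokenizer(oov_token="<OOV>")
toks.fit_on_texts(["the cat sat", "the dog barked loudly"])
seqs = toks.texts_to_sequences(["the dog sat"])   # list of word-index lists, e.g. [[1, 5, 4]]
padded = pad_sequences(seqs, maxlen=6, padding="post", truncating="post")
print(padded)  # shape (1, 6), zero-padded on the right
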
def main():
    parser = argparse.ArgumentParser(description='model')
    parser.add_argument('--input_image_paths', dest='input_image_paths',
                        default='inputs_128_128_1', nargs='+',
                        help='input image paths, separated by spaces')
    parser.add_argument('--max_input_count', dest='max_input_count', type=int,
                        default=10000, help='max input image count for train')
    parser.add_argument('--model_parameter_path', dest='model_parameter_path',
                        default='model_parameter')
    parser.add_argument('--dump_detail', dest='dump_detail', default=False,
                        action='store_true')
    args = parser.parse_args()
    train_x, train_y, test_x, test_y = load_image_data(args.input_image_paths,
                                                       args.max_input_count)
    model = create_model(0.0001)
    load_latest_model_parameter(model, args.model_parameter_path)
    test_model(model, train_x, train_y, args.dump_detail)
    test_model(model, test_x, test_y, args.dump_detail)

def evaluate_line():
    config = utils.load_config(FLAGS.config_file)  # load the config file
    log_path = os.path.join("evl_log", FLAGS.log_test)  # e.g. ./log/train.log
    logger = utils.get_logger(log_path)  # log file name and path
    # limit GPU memory
    tf_config = tf.ConfigProto()  # configuration options for the TensorFlow session
    tf_config.gpu_options.allow_growth = True
    # map_file stores the char<->id and tag<->id mappings
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        # char_to_id maps each character to its id; id_to_char is the inverse mapping
        # print('char_to_id: ', char_to_id)
        # print('tag_to_id: ', tag_to_id)
    with tf.Session(config=tf_config) as sess:
        model = utils.create_model(sess, Model, FLAGS.ckpt_path,
                                   data_utils.load_word2vec, config, id_to_char, logger)
        while True:
            try:
                line = input("Enter a test sentence: ")
                if line == 'exit':
                    break
                result = model.evaluate_line(
                    sess, data_utils.input_from_line(line, char_to_id), id_to_tag)
                print(result)
                logger.debug(result)
            except Exception as e:
                logger.info(e)

def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger, False)
        # Interactive mode, kept for reference:
        # while True:
        #     try:
        #         line = input("Enter a test sentence: ")
        #         result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
        #         print(result)
        #     except Exception as e:
        #         logger.info(e)
        # Alternative sample sentence (Hong Kong property news), kept for reference:
        # line = u"香港的房价已经到达历史巅峰,乌溪沙地铁站上盖由新鸿基地产公司开发的银湖天峰,现在的尺价已经超过一万五千港币。"
        line = u"这是测试语句,国务院加入测试"  # sample: "This is a test sentence; State Council added for testing"
        result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
        print(result)

def evaluate_line():
    config = load_config(FLAGS.config_file)
    # Loads the model configuration from config_file, e.g.:
    # {'model_type': 'idcnn', 'num_chars': 3538, 'char_dim': 100, 'num_tags': 51,
    #  'seg_dim': 20, 'lstm_dim': 100, 'batch_size': 20,
    #  'emb_file': 'E:\\pythonWork3.6.2\\NERuselocal\\NERuselocal\\data\\vec.txt',
    #  'clip': 5, 'dropout_keep': 0.5, 'optimizer': 'adam', 'lr': 0.001,
    #  'tag_schema': 'iobes', 'pre_emb': True, 'zeros': True, 'lower': False}
    logger = get_logger(FLAGS.log_file)  # log file name, e.g. train.log
    # limit GPU memory
    tf_config = tf.ConfigProto()  # session configuration, passed when creating the session
    tf_config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    # note: this must be set on tf_config, not on the model config loaded above
    tf_config.gpu_options.per_process_gpu_memory_fraction = 0.4  # cap GPU usage at 40%
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        while True:
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print(result)

def main():
    # load parameters for the run
    parameters = yaml.safe_load(open(PARAM_FILE_DIRECTORY))
    db_defs = parameters['database']
    solver_param = parameters['solver_param']
    model_param = parameters['model_param']
    model_specific_params = parameters[model_param['model_type']]

    print('Initializing DB connection...')
    db, dictionary, reverse_dictionary = utils.initialize_db_connection(db_defs)
    # Earlier data-collection variants, kept for reference:
    # target_words, context, dictionary, reverse_dictionary = collect_data(db_defs, 5000)
    # data, count, dictionary, reverse_dictionary = collect_data2(vocabulary_size=model_param['vocabulary_size'])
    # target_words, context = generate_batch(data, 500, 2, 2)
    print('Done collecting data.')

    print('Building model...')
    model = create_model(model_param, model_specific_params)
    print('Model built.')

    # Initialize the solver object and train the model.
    solver = Solver(model)
    solver.train(db, db_defs, dictionary, reverse_dictionary, solver_param)

    # Grab embeddings for some sample data.
    embedding = solver.run(np.array([1, 2, 3, 12], dtype=np.int32), solver_param)
    print('done!')

def train(self):
    self.get_sentences_dict()
    self.get_batch_data()
    logger, config = self.get_config()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # limit GPU memory
    steps_per_epoch = self.train_batch_manager.len_data  # number of batches per epoch
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config,
                             self.id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in self.train_batch_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            # predict and evaluate on the dev set
            best = self.evaluate(sess, model, "dev", self.dev_batch_manager,
                                 self.id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)

def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Batch mode over a CSV file, kept for reference:
        # txt = input("Enter a file path: ")
        # with open(txt, encoding='u8') as test_file:
        #     for line in test_file.readlines():
        #         line = line.split(',')
        #         result = model.evaluate_line(sess, input_from_line(line[1], char_to_id), id_to_tag)
        #         print(result)
        while True:
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print(result)

def evaluate_file(file, target):
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Tag every line of the input file and write the results as JSON lines.
        with open(file, encoding='u8') as fr, \
                open(target, mode='w', encoding='u8') as fw:
            for line in fr:
                result = model.evaluate_line(
                    sess, input_from_line(line.strip(), char_to_id), id_to_tag)
                print(result)
                fw.write(json.dumps(result, ensure_ascii=False))
                fw.write("\n")
                fw.flush()
        # Then drop into interactive mode.
        while True:
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print(result)

def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    user_path = os.path.expanduser("~")
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        corpus_path = user_path + '/share/deep_learning/data/knowledge_graph/entity_relation/corpus.yml'
        with open(corpus_path) as f:
            corpus = yaml.safe_load(f)
        for key in corpus:
            sentences = list(set(corpus[key]))  # deduplicate sentences
            for sen in sentences:
                sen_strip = sen.replace(' ', '')
                result = model.evaluate_line(
                    sess, input_from_line(sen_strip, char_to_id), id_to_tag)
                extract_entity(result, key)

def main(_):
    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        # evaluate the model on the test data
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        log_path = os.path.join("log", FLAGS.log_file)
        config = load_config(FLAGS.config_file)
        logger = get_logger(log_path)
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   log_device_placement=True)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
        test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
        test_manager = BatchManager(test_data, 100)
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                 config, id_to_char, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)

def evaluate_line(sents):
    global static_model
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    graph = tf.Graph()
    # limit GPU memory, kept for reference:
    # tf_config = tf.ConfigProto()
    # tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    # with tf.Session(config=tf_config) as sess:
    sess = tf.InteractiveSession(graph=graph)
    print("start creating model")
    static_model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                config, id_to_char, logger)
    print("model created")
    result = static_model.evaluate_line(sess, input_from_line(sents, char_to_id), id_to_tag)
    sess.close()
    return result

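# Rebuilding the graph and restoring the checkpoint on every call, as above, is
# slow. A minimal sketch of a cached variant -- hypothetical, reusing the same
# helpers (load_config, get_logger, create_model, input_from_line) and assuming
# the checkpoint does not change between calls:
_SESS, _MODEL, _MAPS = None, None, None

def evaluate_line_cached(sents):
    global _SESS, _MODEL, _MAPS
    if _MODEL is None:  # build the session and model once, on first use
        config = load_config(FLAGS.config_file)
        logger = get_logger(FLAGS.log_file)
        with open(FLAGS.map_file, "rb") as f:
            _MAPS = pickle.load(f)  # (char_to_id, id_to_char, tag_to_id, id_to_tag)
        _SESS = tf.Session()
        _MODEL = create_model(_SESS, Model, FLAGS.ckpt_path, load_word2vec,
                              config, _MAPS[1], logger)
    char_to_id, _, _, id_to_tag = _MAPS
    return _MODEL.evaluate_line(_SESS, input_from_line(sents, char_to_id), id_to_tag)
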
def predict():
    """
    Run named entity recognition over a dataset.
    :return:
    """
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # limit GPU memory
    # restore the mapping dictionaries from the map_file produced during training
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower, train=False)
    test_manager = BatchManager(test_data, 1)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char, logger)
        logger.info("predicting data......")
        ner_results = model.predict(sess, test_manager, id_to_tag)
        result_write_evaluate(ner_results, FLAGS.result_path, "test")

def evaluate_line():
    # hard-coded paths
    FLAGS.config_file = 'forum_config/config_file'
    FLAGS.log_file = 'forum_config/log/train.log'
    FLAGS.ckpt_path = 'forum_ckpt/'
    # FLAGS.ckpt_path = 'ckpt/'
    FLAGS.map_file = 'forum_config/maps.pkl'
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        while True:
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print(result)

def __test():
    """
    Test interface.
    :return:
    """
    log_path = os.path.join(".", config['log_path'])
    config['ckpt_path'] = "Pretrain/" + domain + "/"
    logger = get_logger(log_path)
    id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj, truth_label, \
        train_entity_list, test_entity_list, valid_entity_list = data_reader(
            logger, config=config, domain=domain,
            entity_entity_topk=entity_knn_number)
    test_data = (entity_relation_Adj, entity_entity_sim_Matrix,
                 test_entity_list, truth_label)
    test_manager = BatchManager(test_data, len(test_entity_list), "test")
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, config['ckpt_path'], config, logger)
        logger.info("start test")
        test_precision, test_ndcg, test_map, test_ndcg_topall = evaluate(
            sess, model, "test", test_manager, logger, id2mid, id2p)
        print("test precision at {}: {:>.5f}".format(k, test_precision))
        print("test ndcg at {}: {:>.5f}".format(k, test_ndcg))
        print("test map: {:>.5f}".format(test_map))
        print("test ndcg_topall: {:>.5f}".format(test_ndcg_topall))

def evaluate_ht():
    submit_path_ht = 'submit_sample/hetong.csv'
    submit_path_file = open(submit_path_ht, 'a+', encoding='gbk')
    # CSV header: announcement id, party A, party B, project name, contract name,
    # contract amount upper bound, contract amount lower bound, consortium members
    submit_path_file.write('公告id,甲方,乙方,项目名称,合同名称,合同金额上限,合同金额下限,联合体成员\n')
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        rootdir = '/home/utopia/corpus/FDDC_part2_data/FDDC_announcements_round1_test_a_20180605/重大合同/html/'
        # list all entries in the directory (renamed from `list` to avoid shadowing the builtin)
        filenames = os.listdir(rootdir)
        for i in range(len(filenames)):
            htmlpath = os.path.join(rootdir, filenames[i])
            if os.path.isfile(htmlpath):
                print(htmlpath)
                s_arr = levelText_withtable(htmlpath)
                candidates = []
                for j in range(len(s_arr)):
                    sen = s_arr[j]
                    result = model.evaluate_line(
                        sess, input_from_line(sen, char_to_id), id_to_tag)
                    entities = result.get('entities')
                    if len(entities) > 0:
                        for en in entities:
                            en['sid'] = j
                            en['pid'] = filenames[i]
                            candidates.append(en)
                org_ht(candidates, submit_path_file)
                print('-------------------------------------------------')

def predict():
    batcher = get_batcher()
    config = load_config(FLAGS.config_file)
    logger = get_logger(os.path.join('log', FLAGS.log_file))
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    def get_test_data(char2id):
        sentences = []
        with open('data/test.txt', 'r', encoding='utf-8') as f:
            for line in f:
                words = line.strip().split('_')
                # map each character to its id, falling back to the <UNK> id
                ids = [char2id[char if char in char2id else '<UNK>']
                       for char in words]
                sentences.append([words, ids])
        return sentences

    test_data = get_test_data(char_to_id)
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config, logger)
        results = model.predict_batch(sess, data=test_data, id_to_tag=id_to_tag,
                                      batcher=batcher, batch_size=FLAGS.batch_size)
        result_to_file(results)

def semisupervised_training(unsup, layers, data, epochs):
    X_train, y_train, X_test, y_test = data[0], data[1], data[2], data[3]
    sup = create_model(layers, 0, 0, 0, 'relu', 'softmax',
                       'binary_crossentropy', 'adam')
    # Alternate between supervised and unsupervised (autoencoding) phases,
    # transferring weights between the two networks after each phase.
    for i in range(3):
        sup.fit(X_train, y_train, validation_data=(X_test, y_test),
                epochs=1, batch_size=200, verbose=2)
        unsup = change_weights(sup, unsup)
        unsup.fit(X_train, X_train, validation_data=(X_test, X_test),
                  epochs=1, batch_size=200, verbose=2)
        sup = change_weights(unsup, sup)
    # Final supervised fine-tuning.
    for j in range(epochs):
        sup.fit(X_train, y_train, validation_data=(X_test, y_test),
                epochs=1, batch_size=200, verbose=2)
    return sup

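# change_weights is defined elsewhere in this project. A minimal sketch of what
# such a weight-transfer helper might look like (hypothetical -- the real helper
# may differ), assuming both arguments are Keras models whose shared layers line
# up by position and shape:
def copy_matching_weights(src, dst):
    for s_layer, d_layer in zip(src.layers, dst.layers):
        s_w, d_w = s_layer.get_weights(), d_layer.get_weights()
        if len(s_w) == len(d_w) and \
                all(a.shape == b.shape for a, b in zip(s_w, d_w)):
            d_layer.set_weights(s_w)  # transfer only shape-compatible layers
    return dst
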
def content_ner():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        for i in range(302):
            m = i + 1
            newslist = []
            with open('newsLists/%d.txt' % m, 'r') as f:
                for line in f.readlines():
                    newslist.append(int(line))
            fout = codecs.open('content_ner/%d.json' % m, 'w', encoding='utf-8')
            # inner loop variable renamed from i to j to avoid shadowing the outer loop
            for j in range(len(newslist)):
                day = newslist[j]
                with codecs.open('D:/PycharmProjects/news_data/%d.json' % day,
                                 encoding='utf-8') as fin:
                    f_d = json.load(fin)
                content = strip_tags(f_d["content"])
                result = model.evaluate_line(
                    sess, input_from_line(content, char_to_id), id_to_tag)
                dicObj = json.dumps(result)
                fout.write(dicObj)
                fout.write("\n")
            fout.close()

def ensemble_val_data():
    preds_raw = []
    labels = []
    for match_str in w_file_matcher:
        os.chdir(MODEL_DIR)
        w_files = glob.glob(match_str)
        for w_file in w_files:
            full_w_file = MODEL_DIR + '/' + w_file
            mname = w_file.split('_')[0]
            print(full_w_file)
            model = create_model(mname)
            model.load_state_dict(torch.load(full_w_file))
            pred, y = make_preds_val(model)
            # pred = np.array(pred)
            preds_raw.append(pred)
            labels.append(y)
            del model
    save_array(PRED_VAL_RAW, preds_raw)
    preds = np.mean(preds_raw, axis=0)  # average the per-model predictions
    save_array(PRED_VAL, preds)
    save_array(VAL_LABELS, labels)
    return preds, labels

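# A tiny self-contained illustration of the averaging step above: stack each
# model's class probabilities and take the element-wise mean (the values here
# are made up for illustration).
import numpy as np

m1 = np.array([[0.9, 0.1], [0.2, 0.8]])  # model 1 probabilities
m2 = np.array([[0.7, 0.3], [0.4, 0.6]])  # model 2 probabilities
ensemble = np.mean([m1, m2], axis=0)     # -> [[0.8, 0.2], [0.3, 0.7]]
print(ensemble.argmax(axis=1))           # ensembled class predictions: [0 1]
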
def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Interactive mode, kept for reference:
        # while True:
        #     try:
        #         line = input("Enter a test sentence: ")
        #         result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
        #         print(result)
        #     except Exception as e:
        #         logger.info(e)
        # Sample sentence on asthma in classical medical literature:
        line = "哮喘古代文献也称“鼻息”、“肩息”、“上气”等。"
        result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
        print(line)
        print([(x["word"], x["type"]) for x in result["entities"]])
        # Sample sentence describing the clinical features of dyspnea:
        line = "喘病是指由于外感或内伤,导致肺失宣降,肺气上逆或气无所主,肾失摄纳,以致呼吸困难,甚则张口抬肩,鼻翼煽动,不能平卧等为主要临床特征的一种病证。"
        result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
        print(line)
        print([(x["word"], x["type"]) for x in result["entities"]])

def main():
    # For GPU memory efficiency
    gpu_options = tf.GPUOptions(allow_growth=True)
    # create log dir
    if not os.path.exists(FLAGS.logdir):
        os.makedirs(FLAGS.logdir)
    model_path = os.path.join(FLAGS.logdir, FLAGS.task_name)
    # Train or Inference
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        if FLAGS.is_train:
            # load training dataset
            print("Start Loading Dataset...")
            mnist = input_data.read_data_sets(FLAGS.data_path, one_hot=False,
                                              validation_size=5000)
            train_data = mnist.train
            val_data = mnist.validation
            print("Loading done.")
            print("Initialize the network")
            if not os.path.exists(model_path) or FLAGS.reset:
                # Create the model if it does not exist, or reset it.
                model = create_model(sess, FLAGS, mode="train")
            else:
                # Otherwise restore the trained model.
                model = load_model(sess, model_path, mode="train")
            train(sess, train_data, val_data, model, FLAGS)
            print("Training Done.")
        else:
            print("The inference mode has been removed from inference.py")

def get_slot_dl(text):
    """
    Get the slot table for a single sentence.
    :param text: the sentence entered by the user
    # :param tf_sess: the TensorFlow session (currently unused)
    :return:
    """
    # Reset the graph so the function can be called repeatedly (but this is slow).
    # FIXME: when integrating the code, consider separating NLU and DM and
    # using unified file management.
    tf.reset_default_graph()
    # hard-coded paths
    FLAGS.config_file = '/forum_config/config_file'
    FLAGS.log_file = '/forum_config/log/train.log'
    FLAGS.ckpt_path = '/forum_ckpt/'
    FLAGS.map_file = '/forum_config/maps.pkl'
    file_path = os.path.dirname(__file__)
    config = load_config(file_path + FLAGS.config_file)
    logger = get_logger(file_path + FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(file_path + FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        result = model.evaluate_line(sess, input_from_line(text, char_to_id), id_to_tag)
    return result

def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # the map_file path may need to be changed
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        while True:
            # TODO: change this to file-in / file-out
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print(result)

def test_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start testing")
        lines = input_pro()
        for line in lines:
            data = json.loads(line)
            print(data)
            doc_id = data["id"]  # renamed from `id` to avoid shadowing the builtin
            doc_txt = data["doc"]
            golden_event = data["golden_event"]
            chunk_result = data["chunk_result"]
            result_entity = data["result"]
            for result in result_entity:
                str_token = result["string"]
                entities = result["entities"]
                test_data = input_from_line(str_token, char_to_id)
                classify_result = model.evaluate_line(sess, test_data)
                print(classify_result)
                result["mention_classify"] = classify_result[0]
            write_to_file(data)

def main():
    # Read in all the parameters defining the model and how it will be trained.
    parameters = yaml.safe_load(open(PARAM_FILE_DIRECTORY))
    solver_param = parameters['solver_param']
    model_param = parameters['model_param']
    model_specific_params = parameters[model_param['model_type']]
    print("Training data using %s model" % (model_param['model_type']))

    print('Loading data set...')
    data = utils.load_data()

    print('Building model...')
    # Initialize the model, then build it with the feature and label sizes.
    model = utils.create_model(model_param, model_specific_params)
    model.build(data['features'], data['categories'])
    print('Model built.')

    # Initialize the solver object and train the model.
    solver = Solver(model)
    solver.train(data['X_train'], data['y_train'], solver_param,
                 data['X_val'], data['y_val'])
    print('done!')

def evaluate_predictsamples():
    config = load_config(FLAGS.config_file)
    logger = get_logger('log/test.log')
    print_config(config, logger)
    logger.info("start predict")
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        index = 1
        with open('data/predict.txt', 'w', encoding='utf-8') as f_write:
            read_file = 'data/test.txt'
            for line in open(read_file, 'r', encoding='utf-8-sig').readlines():
                result = model.evaluate_line(
                    sess, input_from_line(line, char_to_id), id_to_tag)
                print(result)
                content = str(index) + ','
                if result is not None:
                    entities = result['entities']
                    for entity in entities:
                        if entity:
                            print(entity['word'] + '\t' + entity['type'])
                            # content += entity['word'] + '\t' + str(entity['start']) + '\t' + str(entity['end']) + '\t' + entity['type'] + ';'
                            content += entity['word'] + '\t' + entity['type'] + ';'
                f_write.write(content + '\n')
                index += 1

def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        # Split the test document into sentences on the Chinese full stop.
        f = codecs.open(os.path.join(FLAGS.test_filepath, "127_9.txt"), "r", "utf-8")
        s = f.read()
        line = []
        sent = ''
        for i in range(len(s)):
            sent += s[i]
            if s[i] == '。':
                line.append(sent)
                sent = ''
        # line = input("Enter a test sentence: ")  # disabled: would overwrite the parsed sentences
        for info in line:
            print(info)
            result = model.evaluate_line(sess, input_from_line(info, char_to_id), id_to_tag)
            for info1 in result['entities']:
                print(info1)

def stacking(self):
    N, M = len(self.models), len(self.y_train)
    super_model = create_model([N, 500, 500, 1], 0.8, 1, 'elu', 'sigmoid',
                               'binary_crossentropy')
    predictions = np.zeros(shape=(M, N))
    new_models = []
    for i in range(N):
        model = self.models[i]
        # Train each base model on a random 70% subsample of the training set.
        indices = np.array(range(M))
        sample = np.random.choice(indices, round(len(indices) * 0.7), replace=False)
        X, Y = self.X_train[sample], self.y_train[sample]
        model.fit(X, Y, batch_size=5000, verbose=1)
        new_models.append(model)
        # Collect each base model's predictions on the full training set.
        probs = model.predict(self.X_train)
        predictions[:, i] = probs.reshape(M,)
    # Fit the super model to the base models' predictions.
    super_model.fit(predictions, self.y_train)
    return super_model, new_models

def run_test():
    model = create_model(model_type=model_type, pretrained=pretrained,
                         n_classes=n_classes, input_size=input_size,
                         checkpoint=checkpoint)
    model = model.to(device)
    print(model)
    # count_flops(model, device=device)
    test_dataset = EvalDataset('./data/stanford-dogs/Processed/test')
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    loss_func = nn.CrossEntropyLoss()
    acc_list, loss_list = [], []
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(tqdm(test_dataloader)):
            inputs, labels = inputs.float().to(device), labels.to(device)
            preds = model(inputs)
            pred_idx = preds.max(1).indices
            acc = (pred_idx == labels).sum().item() / labels.size(0)
            acc_list.append(acc)
            loss = loss_func(preds, labels).item()
            loss_list.append(loss)
    final_loss = np.array(loss_list).mean()
    final_acc = np.array(acc_list).mean()
    print('Test loss: {}\nTest accuracy: {}'.format(final_loss, final_acc))

def setUp(self):
    admin_opts = {}
    fields = {'name': models.CharField(max_length=255)}
    self.dynamic_model = create_model('DynamicModel', fields=fields,
                                      app_label='context_admin',
                                      admin_opts=admin_opts)
    fields = {
        'name': models.CharField(max_length=255),
        'dynamic_model': models.ForeignKey(self.dynamic_model)
    }
    self.dynamic_inner_model = create_model('DynamicInnerModel', fields=fields,
                                            app_label='context_admin',
                                            admin_opts=admin_opts)
    install(self.dynamic_model)
    install(self.dynamic_inner_model)
    instance = self.dynamic_model()
    instance.save()

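# create_model here builds a Django model class at runtime. A minimal sketch of
# one common implementation of that pattern (hypothetical -- the project's own
# helper may differ, e.g. in how it handles admin_opts):
from django.db import models

def create_model(name, fields=None, app_label='', module='', admin_opts=None):
    # A Django model is just a class, so type() can declare one dynamically.
    class Meta:
        pass
    Meta.app_label = app_label
    attrs = {'__module__': module, 'Meta': Meta}
    if fields:
        attrs.update(fields)
    return type(name, (models.Model,), attrs)
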
def evaluate_line():
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        while True:
            line = input("Enter a test sentence: ")
            result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
            print(result)

def train(data_path, config):
    with tf.Graph().as_default(), tf.Session() as session:
        word_to_id_path = os.path.join(data_path, config.vocab_file)
        with open(word_to_id_path, "rb") as f:
            word_to_id = pickle.load(f)
        vocab_size = len(word_to_id)
        print("Vocab size: %d" % vocab_size)
        sys.stdout.flush()

        train_pattern = config.data_pattern.replace("{-type-}", "train") + ".part*"
        valid_pattern = config.data_pattern.replace("{-type-}", "valid") + ".part*"
        train_files = get_file_list(config, data_path, train_pattern, "train")
        valid_files = get_file_list(config, data_path, valid_pattern, "valid")
        if config.copy_temp:
            temp_dir = tempfile.mkdtemp()
            print("Copying data files to %s" % temp_dir)
            train_files = copy_temp_files(train_files, temp_dir)
            valid_files = copy_temp_files(valid_files, temp_dir)
        config.vocab_size = vocab_size

        train_batcher = PreBatched(train_files, config.batch_size, description="train") \
            if config.use_prebatched \
            else QueuedSequenceBatcher(train_files, config.seq_length, config.batch_size,
                                       description="train", attns=config.attention)
        valid_batcher = PreBatched(valid_files, config.batch_size, description="valid") \
            if config.use_prebatched \
            else QueuedSequenceBatcher(valid_files, config.seq_length, config.batch_size,
                                       description="valid", attns=config.attention)

        t0 = datetime.datetime.now()
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
        # Build the training model and a validation model that shares its variables.
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = create_model(config, True)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = create_model(config, False)

        summary_writer = tf.train.SummaryWriter(config.events_path, graph=session.graph)
        valid_perplexity = PerplexityHook(summary_writer, mvalid, valid_batcher)
        hooks = [
            SpeedHook(summary_writer, config.status_iterations, config.batch_size),
            LossHook(summary_writer, config.status_iterations),
            valid_perplexity,
            SaveModelHook(config.checkpoint_path, 1, config.__dict__, 5)
        ]
        t1 = datetime.datetime.now()
        print("Building models took: %s" % (t1 - t0))

        def load_func():
            if config.model_path is not None:
                load_model(session, config.model_path)
                print("Continuing training from model: %s" % config.model_path)
            if config.embedding_path is not None:
                load_variables(session,
                               os.path.join(config.embedding_path, "embedding.tf"),
                               [m.embedding_variable])
                print("Loading embedding vectors from: %s" % config.embedding_path)

        trainer = Trainer(m.optimizer, config.epochs, hooks, m, m.train_op)
        trainer(train_batcher, m.loss, session, config.learning_rate,
                config.lr_decay, load_func)

        saver = tf.train.Saver(tf.trainable_variables())
        embedding_saver = tf.train.Saver([m.embedding_variable])
        print("Saving model...")
        out_path = save_model(saver, session, config.save_path, m.predict, config.__dict__)
        embedding_saver.save(session, os.path.join(out_path, "embedding.tf"))
        if config.copy_temp:
            shutil.rmtree(temp_dir)

def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use the selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create a dictionary for characters
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]))
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)