常数定义 ''' file_name = "File_Directory/results/{}.json".format(app_name) new_data_name = "{}_re_predict_data".format(app_name) new_result_name = "{}_re_predict_out".format(app_name) final_result_name = "{}_final_out".format(app_name) threshold = args["re_predict_threshold"] mix_rate = args['re_predict_mix_rate'] decay_rate = args['re_predict_decay_rate'] select_threshold = args['re_predict_select_threshold'] ''' 预测过程 ''' datasets = Dataset(logger=logger, args=param.get_config(param.DATASET)) datasets.load_examples() trainset, validset, testset = datasets.get_split() predict_preprocess = PreProcess(logger=logger, args=param.get_config(param.DATASET), examples=testset, for_prediction=True) predict_preprocess.prepare_batch_data(cache_filename="") predict_vocab_size = predict_preprocess.get_vocab_size() predict_batch_reader = predict_preprocess.batch_generator() predict_engine = PredictEngine(param=param, logger=logger, vocab_size=1) predict_engine.init_model(vocab_size=predict_vocab_size) predict_engine.predict(predict_batch_reader) example_info = util_tool.trans_exam_list_to_colum(testset) predict_engine.write_full_info(attach_data=example_info)
app_name = args["app_name"]

# Disabled corpus-cleaner debugging snippet, kept for reference:
# corpus_cleaner = Corpus_cleaner()
# # corpus_cleaner.read_from_json("pretrain_corpus.json")
# corpus_cleaner.read_from_src()
# docs = corpus_cleaner.get_docs()
# for i in range(10):
#     print(docs[i])
#     print("###########################################################")

# Load the dataset and take the train/valid/test split.
datasets = Dataset(logger=logger, args=param.get_config(param.DATASET))
# datasets.read_dataset(div_nums=[7, 2, 1])
datasets.load_examples()
trainset, validset, testset = datasets.get_split()
# TODO: these three functions need rework; split should check whether the
# data has already been split.
# datasets.save_example()

# Training data preprocessing: convert examples to features, then read the
# vocabulary size and obtain the batch generator.
train_preprocess = PreProcess(logger=logger,
                              args=param.get_config(param.DATASET),
                              examples=trainset,
                              feature_file_name='train_feature_for_multi_task')
train_preprocess.convert_examples_to_features()
train_vocab_size = train_preprocess.get_vocab_size()
train_batch_reader = train_preprocess.batch_generator()

# Validation data preprocessing: same steps, applied to the validation split.
valid_preprocess = PreProcess(logger=logger,
                              args=param.get_config(param.DATASET),
                              examples=validset,
                              feature_file_name='valid_feature_for_multi_task')
valid_preprocess.convert_examples_to_features()
valid_vocab_size = valid_preprocess.get_vocab_size()
valid_batch_reader = valid_preprocess.batch_generator()