def create_and_train_model(config, tfconfig, fold):
    with tf.Session(config=tfconfig) as sess:
        if config['run_op']['fix_random_seed']:
            tf.set_random_seed(12345)  # set random seed
            tf.logging.info("Fix random seed")
        model = BiGRU(sess, config, kbqa_flag)
        model.model_path = model_path
        model.log_path = log_path
        tf.logging.info("Print logging")
        print_config(config)
        time.sleep(3)
        if run == "train":
            run_op.train(model, config)
        elif run == "valid":
            run_op.valid_print(model, config)
        elif run == "test":
            run_op.test(model, config, True, True)
        elif run == "test_kbqa":
            run_op.test(model, config, True, True, kbqa_flag=True)
        elif run == "vis_emb":
            run_op.visualization_emb(model, config)
        else:
            tf.logging.info("error in run! only accept train/valid/test/test_kbqa/vis_emb")
            exit(1)
        FileUtil.writeFile([model.model_path], "current_train_model_name.txt", True)
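# Hypothetical caller sketch (not part of the original script): `run`,
# `model_path`, `log_path`, and `kbqa_flag` are module-level globals set
# by the surrounding driver before this function is invoked.
# tfconfig = tf.ConfigProto()
# tfconfig.gpu_options.allow_growth = True  # allocate GPU memory on demand
# create_and_train_model(config, tfconfig, fold=0)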
def readSimpleQAData(filename):
    context = FileUtil.readFile(filename)
    output = []
    for c in context:
        questionTriple = c.split("\t")
        # last field is the question; the first three are the gold triple
        question = tokenizer(questionTriple[-1].lower())
        output.append("\t".join(questionTriple[:3] + [question]))
    # filename[:-3] assumes a three-character extension such as "txt"
    FileUtil.writeFile(output, "../data/" + filename[:-3] + "pre.txt")
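# Illustrative input line (an assumption based on how the fields are used
# above; it mirrors the SimpleQuestions annotated-data layout):
#   <subject_mid> \t <relation> \t <object_mid> \t <natural-language question>
# e.g. www.freebase.com/m/04whkz5 \t www.freebase.com/book/written_work/subject \t www.freebase.com/m/01cj3p \t what is the book e about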
def load_relation_emb(self, path):
    if path.endswith("npy"):
        return np.load(path)
    context = FileUtil.readFile(path)
    rel_embedding = []
    for t in context:
        data = [float(x) for x in t.split("\t")]
        rel_embedding.append(data)
    return np.array(rel_embedding, dtype=np.float32)
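# Minimal usage sketch (the paths are hypothetical): the loader accepts
# either a serialized NumPy matrix (.npy) or a tab-separated text file
# with one embedding vector per line; the text branch yields float32.
# rel_emb = self.load_relation_emb("../data/relation_emb.txt")
# print(rel_emb.shape)  # (num_relations, embedding_dim)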
def merge_result():
    names = FileUtil.readFile("current_train_model_name.txt")
    errors = []
    candidate_df = []
    for n in names:
        result_path = "{}/result.csv".format(n)
        temp, state = read_result(result_path)
        if state:
            candidate_df.append(temp)
        else:
            errors.append(state)
    concat_result = pd.concat(candidate_df, axis=0)
    output_path = "result.csv"
    concat_result.to_csv(output_path)
    return output_path, errors
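# Usage sketch, under the assumption (inferred from the call above) that
# read_result returns (DataFrame, True) on success and a falsy state on
# failure; read_result itself is defined elsewhere in this module.
# output_path, errors = merge_result()
# if errors:
#     tf.logging.info("failed to merge {} result files".format(len(errors)))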
def test(model, config, final_test, write_file, kbqa_flag=False):
    model_location = None
    if final_test:
        init = tf.global_variables_initializer()
        model.sess.run(init)
        model_location = tf.train.latest_checkpoint(model.model_path)
        tf.logging.info("restore model {}".format(model_location))
        model.saver.restore(model.sess, model_location)

    # begin test!
    step = 1
    qid = 0
    acc = 0
    model.qa.itemIndexTest = 0
    relation_detection_output = []
    unseen_relation_output = []
    relation_output = []
    unseen_all = 0
    unseen_acc = 0
    word_big_part = 0
    word_small_part = 0
    word_equal_part = 0
    all_para_num = 0
    choose_seen_error = 0
    choose_unseen_error = 0
    kbqa_acc = 0
    unseen_kbqa_acc = 0
    seen_macro_output = []
    unseen_macro_output = []
    macro_output_for_seen_rate = []

    while (step - 1) * model.dev_batch_size < model.testing_iters:
        ss = time.time()
        if kbqa_flag:
            value = model.qa.load_test_data(model.dev_batch_size, "test", kbqa_flag)
        else:
            value = model.qa.load_test_data(model.dev_batch_size)
        feed = {
            model.question_ids: value['batch_x_anonymous'],
            model.relation_index: value['batch_relation_index'],
            model.relation_lens: value['batch_relation_lens'],
            model.x_lens: value['batch_x_anonymous_lens'],
            model.is_training: False,
        }
        wo_cand_rel, score, rel_word_vec, rel_part_vec = model.sess.run(
            [model.rel_pred, model.rel_score, model.word_test, model.part_test],
            feed_dict=feed)

        # track whether word-level or part-level scores dominate
        rel_word_vec = np.max(rel_word_vec, 1)
        row, col = rel_word_vec.shape
        all_para_num += row * col
        rel_part_vec = np.max(rel_part_vec, 1)
        word_big_part += np.sum(rel_word_vec > rel_part_vec)
        word_small_part += np.sum(rel_word_vec < rel_part_vec)
        word_equal_part += np.sum(rel_word_vec == rel_part_vec)

        for i in range(value['batch_size']):
            temp_gold_relation = value['gold_relation'][i]
            if wo_cand_rel[i] >= len(value['cand_rel_list'][i]):
                # predicted index falls outside the candidate list
                qid += 1
                relation_output.append("{}\t{}".format(temp_gold_relation, "oov"))
                continue
            qid += 1
            pre = value['cand_rel_list'][i][wo_cand_rel[i]]
            current_query = value['questions'][i]
            out_str = "{}\t{}\t{}\t{}".format(
                value['qids'][i], current_query,
                model.qa.rel_voc[pre], model.qa.rel_voc[temp_gold_relation])
            relation_detection_output.append(out_str)

            unseen = temp_gold_relation not in model.train_relation
            if unseen:
                unseen_all += 1
                unseen_relation_output.append(out_str)
                unseen_macro_output.append("{}\t{}".format(temp_gold_relation, pre))
                if pre in model.train_relation:
                    choose_seen_error += 1
                    macro_output_for_seen_rate.append("{}\t{}".format(temp_gold_relation, 1))
                else:
                    macro_output_for_seen_rate.append("{}\t{}".format(temp_gold_relation, 0))
            else:
                seen_macro_output.append("{}\t{}".format(temp_gold_relation, pre))
                if pre not in model.train_relation:
                    choose_unseen_error += 1

            if temp_gold_relation == pre:
                acc += 1
                if unseen:
                    unseen_acc += 1

            if kbqa_flag:
                gold_subject = value['gold_subject'][i]
                if (temp_gold_relation == pre
                        and gold_subject in model.qa.subject2relation
                        and pre in model.qa.subject2relation[gold_subject]):
                    kbqa_acc += 1
                    if unseen:
                        unseen_kbqa_acc += 1

        if (step - 1) % model.display_step == 0:
            tf.logging.info("rate:\t%d/%d" % (step, model.testing_iters / model.dev_batch_size))
            ee = time.time()
            tf.logging.info("time:\t" + str(ee - ss))
        step += 1

    unseen_error = unseen_all - unseen_acc
    seen_all = qid - unseen_all
    seen_error = (qid - unseen_all) - (acc - unseen_acc)
    assert all_para_num == (word_big_part + word_small_part + word_equal_part)

    # prepare result
    output_dic = {}
    output_columns = [
        "model_name", "test_seen", "seen_macro_acc", "test_unseen",
        "unseen_macro_acc", "all", "all_macro_acc", "location", "server",
        "kbqa_acc", "unseen_choose_seen_macro", "unseen_relation_num", "time"
    ]
    output_dic['unseen_choose_seen_macro'] = cal_macro_acc_seen_rate(
        macro_output_for_seen_rate)
    tf.logging.info("para composition big:equal:small = {}:{}:{}".format(
        word_big_part, word_equal_part, word_small_part))
    tf.logging.info("qid {} test_len {}".format(qid, len(model.qa.test_data)))
    assert qid == len(model.qa.test_data)
    output_dic['time'] = time.asctime(time.localtime(time.time()))
    true_relation_score = cal_macro_acc(relation_output)
    output_dic['seen_macro_acc'] = cal_macro_acc(seen_macro_output)
    output_dic['unseen_macro_acc'] = cal_macro_acc(unseen_macro_output)
    output_dic['all_macro_acc'] = cal_macro_acc(seen_macro_output + unseen_macro_output)
    all_acc = acc * 1.0 / qid
    output_dic['all'] = all_acc
    kbqa_acc = kbqa_acc * 1.0 / qid
    output_dic['kbqa_acc'] = kbqa_acc
    output_dic['unseen_relation_num'] = unseen_all
    if qid - unseen_all == 0:
        seen_acc = 0
    else:
        seen_acc = (acc - unseen_acc) * 1.0 / (qid - unseen_all)
    output_dic['test_seen'] = seen_acc
    if unseen_all != 0:
        unseen_acc = unseen_acc * 1.0 / unseen_all
    output_dic['test_unseen'] = unseen_acc
    tf.logging.info("all {} seen {} unseen acc {}".format(all_acc, seen_acc, unseen_acc))
    output_dic['location'] = model_location
    output_dic['model_name'] = config['model']['name']
    csv_path = "{}/result.csv".format(model.model_path)
    if final_test:
        result_util.write_result(output_dic, output_columns, csv_path)
    if write_file:
        FileUtil.writeFile(unseen_relation_output,
                           "{}/unseen.output.txt".format(model.model_path))
        FileUtil.writeFile(relation_detection_output,
                           "{}/all.output.txt".format(model.model_path))
    if not final_test:
        return acc * 1.0 / qid, seen_acc, unseen_acc, true_relation_score
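# cal_macro_acc and cal_macro_acc_seen_rate are defined elsewhere in the
# project. A minimal sketch of the macro-accuracy computation, inferred
# from the "gold\tprediction" strings fed to it above (an assumption, not
# the project's actual implementation):
def cal_macro_acc_sketch(lines):
    from collections import defaultdict
    correct = defaultdict(int)  # per-relation hit count
    total = defaultdict(int)    # per-relation sample count
    for line in lines:
        gold, pred = line.split("\t")
        total[gold] += 1
        if gold == pred:
            correct[gold] += 1
    if not total:
        return 0.0
    # macro accuracy: average the per-relation accuracies uniformly
    return sum(1.0 * correct[r] / total[r] for r in total) / len(total)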
parse.add_argument("--same_tl", action="store_true", help="run same train len") parse.add_argument("--star_model", action="store_true", help="run same relation size") parse.add_argument("--test_raw_model", action="store_true", help="test_raw_model") args = parse.parse_args() tf.logging.info("Use tf") from src.network import BiGRU from src import run_op as run_op run = args.run config = FileUtil.load_from_config(args.config) def create_dir(path): if not os.path.exists(path): os.system("mkdir -p {}".format(path)) default_model_dir = "/home/wup/qa+adapter_new_dev/model" if "model_dir" in config['run_op']: default_model_dir = config['run_op']['model_dir'] default_log_dir = "/home/wup/qa+adapter_new_dev/log" if "log_dir" in config['run_op']: default_log_dir = config['run_op']['log_dir']
class PortugueseTextualProcessing:
    NLP = spacy.load('pt_core_news_md')
    STOPWORDS = set(nltk.corpus.stopwords.words('portuguese'))
    CUSTOM_STOPWORDS = FileUtil().get_words_from_file('./resources/custom_stopwords.txt')
    TAGGER = load(open('./util/pttag-mm.pkl', 'rb'))
    EMBEDDING_DIM = 100
    MAX_NUM_WORDS = 20000
    LONG_SENTENCE_SIZE = 12
    SHORT_SENTENCE_SIZE = 6
    PT_DICT = pyphen.Pyphen(lang='pt_BR')
    SILVA_SYLLABLE_SEPARATOR = Silva2011SyllableSeparator()
    NER_PT_TAGGER = FileUtil().load_ner_pickle()
    NER_TIME_TAGGER = FileUtil().load_ner_pickle('./resources/cat-entropy_cutoff_0.08.pickle')
    # note: the original list concatenated 'nem' 'caso' into one token;
    # a comma was missing between them
    LOGICAL_OPERATORS = [
        'e', 'nada', 'a menos que', 'ou', 'nunca', 'sem que', 'não',
        'jamais', 'nem', 'caso', 'se', 'nenhum', 'nenhuma',
        'então é porque', 'desde que', 'contanto que', 'uma vez que', 'fosse'
    ]
    CONTENT_TAGS = ['N', 'ADJ', 'ADV', 'V']
    FUNCTIONAL_TAGS = ['ART', 'PREP', 'PRON', 'K']
    DEFAULT_NOT_FOUND_TAG = 'notfound'
    CASE_SENSITIVE_PATTERN = '[A-Z][a-z]*'
    NUMBER_ONLY_PATTERN = '[0-9]'
    RICH_TAG_TYPES = ['Gender', 'Number', 'Person', 'PronType', 'VerbForm', 'Tense']

    def __init__(self):
        pass

    @staticmethod
    def tokenize(text):
        tokens = tokenize.word_tokenize(text, language='portuguese')
        slash_tokens = [i for i in tokens if '/' in i]
        if slash_tokens:
            PortugueseTextualProcessing().separate_slash_tokens(tokens, slash_tokens)
        return tokens

    @staticmethod
    def count_lemmas(text):
        doc = PortugueseTextualProcessing.NLP(text)
        return len([token for token in doc if token.text != token.lemma_])

    @staticmethod
    def get_rich_tags(text):
        doc = PortugueseTextualProcessing.NLP(text)
        tags = [(token.lemma_, token.pos_, token.tag_) for token in doc]
        tagged_text = ''.join([tag[2] for tag in tags if "|" in tag[2]]).split("|")
        return PortugueseTextualProcessing().extract_tags(tagged_text)

    @staticmethod
    def extract_tags(tagged_text):
        tags = []
        for tag in PortugueseTextualProcessing().RICH_TAG_TYPES:
            if tag != 'Person':
                type_tags = PortugueseTextualProcessing().get_regular_tags(
                    tag, PortugueseTextualProcessing().CASE_SENSITIVE_PATTERN, tagged_text)
            else:
                type_tags = PortugueseTextualProcessing().get_regular_tags(
                    tag, PortugueseTextualProcessing().NUMBER_ONLY_PATTERN, tagged_text)
            tags.append(RichTagFrequency(
                tag, ''.join(type_tags).replace(tag + '=', ' ').split(' ')[1:]))
        return tags

    @staticmethod
    def get_regular_tags(pattern, case, tagged_text):
        return re.findall(pattern + '=' + case, ''.join(tagged_text))
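# Quick usage sketch (outputs shown are what NLTK's Portuguese tokenizer
# is expected to produce; treat them as illustrative):
# tokens = PortugueseTextualProcessing.tokenize('O menino correu para casa.')
# print(tokens)  # e.g. ['O', 'menino', 'correu', 'para', 'casa', '.']
# print(PortugueseTextualProcessing.count_lemmas('Os meninos correram'))  # tokens whose lemma differs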
def test_zip_dir(self):
    FileUtil.zip_dir("c:/tools", "c:/tools.zip")