Example #1
def create_and_train_model(config, tfconfig, fold):

    with tf.Session(config=tfconfig) as sess:
        if config['run_op']['fix_random_seed']:
            tf.set_random_seed(12345)  # make runs reproducible
            tf.logging.info("Fix random seed")
        # kbqa_flag, model_path, log_path and run are module-level globals
        # set up by the launcher script (see Example #6)
        model = BiGRU(sess, config, kbqa_flag)
        model.model_path = model_path
        model.log_path = log_path

        tf.logging.info("Print logging")
        print_config(config)
        time.sleep(3)

        if run == "train":
            run_op.train(model, config)
        elif run == "valid":
            run_op.valid_print(model, config)
        elif run == "test":
            # pass
            run_op.test(model, config, True, True)
        elif run == "test_kbqa":
            run_op.test(model, config, True, True, kbqa_flag=True)
        elif run == "vis_emb":
            run_op.visualization_emb(model, config)
        else:
            tf.logging.info("error in run! only accept train or test")
            exit(1)

        FileUtil.writeFile([model.model_path], "current_train_model_name.txt",
                           True)
Example #2
def readSimpleQAData(filename):
    context = FileUtil.readFile(filename)
    output = []
    for c in context:
        questionTriple = c.split("\t")
        question = tokenizer(questionTriple[-1].lower())
        output.append(questionTriple[0] + "\t" + questionTriple[1] + "\t" +
                      questionTriple[2] + "\t" + question)
    # the output name replaces the trailing "txt" of the input name with "pre.txt"
    FileUtil.writeFile(output, "../data/" + filename[:-3] + "pre.txt")
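A minimal sketch of the line layout readSimpleQAData assumes, inferred only from the indexing above (three tab-separated fields followed by the question); the concrete values are hypothetical, and tokenizer is whatever tokenizer the surrounding module provides.

# Hypothetical input line; the field semantics are an assumption based on
# questionTriple[0..2] and questionTriple[-1] above.
sample_line = "m.04whkz5\tbook/written_work/subjects\tm.01cj3p\twhat is the book about"
parts = sample_line.split("\t")
triple, question = parts[:3], parts[-1].lower()
print(triple, question)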
Example #3
 def load_relation_emb(self, path):
     # .npy files are loaded directly; anything else is treated as plain
     # text with one tab-separated embedding per line
     if path.endswith("npy"):
         return np.load(path)
     context = FileUtil.readFile(path)
     rel_embedding = []
     for t in context:
         data = t.split("\t")
         data = [float(x) for x in data]
         rel_embedding.append(data)
     return np.array(rel_embedding, dtype=np.float32)
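The method accepts two on-disk formats. A small sketch of each, assuming the text variant stores one embedding per line as tab-separated floats; the file names are placeholders.

import numpy as np

emb = np.random.rand(3, 4).astype(np.float32)

# binary format: load_relation_emb("rel_emb.npy") would np.load this directly
np.save("rel_emb.npy", emb)

# text format: one relation embedding per line, values separated by tabs
with open("rel_emb.txt", "w") as f:
    for row in emb:
        f.write("\t".join(str(x) for x in row) + "\n")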
Example #4
def merge_result():
    names = FileUtil.readFile("current_train_model_name.txt")
    # names = FileUtil.readFile("tl_model.txt")
    errors = []
    candidate_df = []
    for n in names:
        result_path = "{}/result.csv".format(n)
        temp, state = read_result(result_path)
        if state:
            candidate_df.append(temp)
        else:
            # remember which result file could not be read
            errors.append(result_path)
    concat_result = pd.concat(candidate_df, axis=0)
    output_path = "result.csv"
    concat_result.to_csv(output_path)
    return output_path, errors
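read_result is not shown in this example. A stand-in consistent with how merge_result uses it (it must return a DataFrame plus a success flag) might look like the sketch below; this is an assumption about its contract, not the project's actual helper.

import os
import pandas as pd

def read_result(result_path):
    # hypothetical helper: load one model's result.csv and report success
    if not os.path.exists(result_path):
        return None, False
    return pd.read_csv(result_path), True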
Example #5
def test(model, config, final_test, write_file, kbqa_flag=False):

    model_location = None
    if final_test:
        init = tf.global_variables_initializer()
        model.sess.run(init)

        model_location = tf.train.latest_checkpoint(model.model_path)
        tf.logging.info("restore model {}".format(model_location))
        model.saver.restore(model.sess, model_location)

    # begin test!
    step = 1
    relation_detection_output = []
    qid = 0
    acc = 0

    model.qa.itemIndexTest = 0
    unseen_relation_output = []
    unseen_all = 0
    unseen_acc = 0

    relation_output = []

    word_big_part = 0
    word_small_part = 0
    word_equal_part = 0
    all_para_num = 0

    choose_seen_error = 0
    choose_unseen_error = 0

    kbqa_acc = 0
    unseen_kbqa_acc = 0

    seen_macro_output = []
    unseen_macro_output = []

    macro_output_for_seen_rate = []

    while (step - 1) * model.dev_batch_size < model.testing_iters:

        ss = time.time()
        if kbqa_flag:
            value = model.qa.load_test_data(model.dev_batch_size, "test",
                                            kbqa_flag)
        else:
            value = model.qa.load_test_data(model.dev_batch_size)

        feed = {
            model.question_ids: value['batch_x_anonymous'],
            model.relation_index: value['batch_relation_index'],
            model.relation_lens: value['batch_relation_lens'],
            model.x_lens: value['batch_x_anonymous_lens'],
            model.is_training: False,
        }

        wo_cand_rel, score, rel_word_vec, rel_part_vec = model.sess.run(
            [
                model.rel_pred, model.rel_score, model.word_test,
                model.part_test
            ],
            feed_dict=feed)

        rel_word_vec = np.max(rel_word_vec, 1)
        row, col = rel_word_vec.shape
        all_para_num += row * col
        rel_part_vec = np.max(rel_part_vec, 1)
        word_big_part += np.sum(rel_word_vec > rel_part_vec)
        word_small_part += np.sum(rel_word_vec < rel_part_vec)
        word_equal_part += np.sum(rel_word_vec == rel_part_vec)

        for i in range(value['batch_size']):
            temp_gold_relation = value['gold_relation'][i]

            if wo_cand_rel[i] >= len(value['cand_rel_list'][i]):
                qid += 1
                relation_output.append("{}\t{}".format(temp_gold_relation,
                                                       "oov"))
                continue

            qid += 1
            pre = value['cand_rel_list'][i][wo_cand_rel[i]]
            current_query = value['questions'][i]
            out_str = "{}\t{}\t{}\t{}".format(
                value['qids'][i], current_query, model.qa.rel_voc[pre],
                model.qa.rel_voc[temp_gold_relation])

            relation_detection_output.append(out_str)
            unseen = False

            if temp_gold_relation not in model.train_relation:
                unseen_all += 1
                unseen = True
                unseen_relation_output.append(out_str)

            if temp_gold_relation in model.train_relation:
                seen_macro_output.append("{}\t{}".format(
                    temp_gold_relation, pre))
            else:
                unseen_macro_output.append("{}\t{}".format(
                    temp_gold_relation, pre))

            if unseen:
                if pre in model.train_relation:
                    # unseen gold relation, but a seen relation was predicted
                    choose_seen_error += 1
                    macro_output_for_seen_rate.append("{}\t{}".format(
                        temp_gold_relation, 1))
                else:
                    macro_output_for_seen_rate.append("{}\t{}".format(
                        temp_gold_relation, 0))

            if (not unseen) and (pre not in model.train_relation):
                choose_unseen_error += 1
            if temp_gold_relation == pre:
                acc += 1
                if unseen:
                    unseen_acc += 1

            if kbqa_flag:
                gold_subject = value['gold_subject'][i]
                # for KBQA, the prediction only counts if the predicted
                # relation is actually attached to the gold subject in the KB
                if (temp_gold_relation == pre
                        and gold_subject in model.qa.subject2relation
                        and pre in model.qa.subject2relation[gold_subject]):
                    kbqa_acc += 1
                    if unseen:
                        unseen_kbqa_acc += 1

        if (step - 1) % model.display_step == 0:
            tf.logging.info("rate:\t%d/%d" %
                            (step,
                             (model.testing_iters / model.dev_batch_size)))
            ee = time.time()
            tf.logging.info("time:\t" + str(ee - ss))

        step += 1
    unseen_error = unseen_all - unseen_acc
    seen_all = qid - unseen_all
    seen_error = (qid - unseen_all) - (acc - unseen_acc)
    assert all_para_num == (word_big_part + word_small_part + word_equal_part)

    # prepare result
    output_dic = {}
    output_columns = [
        "model_name", "test_seen", "seen_macro_acc", "test_unseen",
        "unseen_macro_acc", "all", "all_macro_acc", "location", "server",
        "kbqa_acc", 'unseen_choose_seen_macro', "unseen_relation_num", "time"
    ]

    output_dic['unseen_choose_seen_macro'] = cal_macro_acc_seen_rate(
        macro_output_for_seen_rate)

    tf.logging.info("para composition big:equal:small = {}:{}:{}".format(
        word_big_part, word_equal_part, word_small_part))

    tf.logging.info("qid {} test_len {}".format(qid, len(model.qa.test_data)))
    assert qid == len(model.qa.test_data)

    now = time.asctime(time.localtime(time.time()))
    output_dic['time'] = now

    true_relation_score = cal_macro_acc(relation_output)
    output_dic['seen_macro_acc'] = cal_macro_acc(seen_macro_output)
    output_dic['unseen_macro_acc'] = cal_macro_acc(unseen_macro_output)
    output_dic['all_macro_acc'] = cal_macro_acc(seen_macro_output +
                                                unseen_macro_output)

    all_acc = acc * 1.0 / qid
    output_dic['all'] = all_acc

    kbqa_acc = kbqa_acc * 1. / qid
    output_dic['kbqa_acc'] = kbqa_acc

    output_dic['unseen_relation_num'] = unseen_all

    if qid - unseen_all == 0:
        seen_acc = 0
    else:
        seen_acc = (acc - unseen_acc) * 1.0 / (qid - unseen_all)
    output_dic['test_seen'] = seen_acc

    if unseen_all != 0:
        unseen_acc = unseen_acc * 1.0 / unseen_all
    output_dic['test_unseen'] = unseen_acc

    tf.logging.info("all {} seen {} unseen acc {}".format(
        all_acc, seen_acc, unseen_acc))

    output_dic['location'] = model_location

    model_name = config['model']['name']
    output_dic['model_name'] = model_name

    csv_path = "{}/result.csv".format(model.model_path)
    if final_test:
        result_util.write_result(output_dic, output_columns, csv_path)

    if write_file:
        FileUtil.writeFile(unseen_relation_output,
                           "{}/unseen.output.txt".format(model.model_path))
        FileUtil.writeFile(relation_detection_output,
                           "{}/all.output.txt".format(model.model_path))

    if not final_test:
        return acc * 1.0 / qid, seen_acc, unseen_acc, true_relation_score
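cal_macro_acc and cal_macro_acc_seen_rate are defined elsewhere in the project. Judging only from how cal_macro_acc is called here (lists of "gold\tpredicted" strings), a plausible stand-in that averages per-gold-relation accuracy is sketched below; the authors' actual definition may differ.

from collections import defaultdict

def cal_macro_acc(lines):
    # hypothetical macro accuracy: mean of per-gold-relation accuracies
    correct, total = defaultdict(int), defaultdict(int)
    for line in lines:
        gold, pred = line.split("\t")
        total[gold] += 1
        if gold == pred:
            correct[gold] += 1
    if not total:
        return 0.0
    return sum(correct[r] / total[r] for r in total) / len(total)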
Example #6
parse.add_argument("--same_tl", action="store_true", help="run same train len")
parse.add_argument("--star_model",
                   action="store_true",
                   help="run same relation size")

parse.add_argument("--test_raw_model",
                   action="store_true",
                   help="test_raw_model")
args = parse.parse_args()

tf.logging.info("Use tf")
from src.network import BiGRU
from src import run_op as run_op

run = args.run
config = FileUtil.load_from_config(args.config)


def create_dir(path):
    # create the directory (and any missing parents) if it does not exist
    if not os.path.exists(path):
        os.makedirs(path)


default_model_dir = "/home/wup/qa+adapter_new_dev/model"
if "model_dir" in config['run_op']:
    default_model_dir = config['run_op']['model_dir']

default_log_dir = "/home/wup/qa+adapter_new_dev/log"
if "log_dir" in config['run_op']:
    default_log_dir = config['run_op']['log_dir']
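FileUtil.load_from_config is not shown in these examples; judging from the keys they read (run_op.fix_random_seed, run_op.model_dir, run_op.log_dir, model.name), the loaded configuration behaves like a nested mapping roughly of this shape. The values below are illustrative assumptions, except for the directory defaults taken from the code above.

config = {
    "run_op": {
        "fix_random_seed": True,
        "model_dir": "/home/wup/qa+adapter_new_dev/model",
        "log_dir": "/home/wup/qa+adapter_new_dev/log",
    },
    "model": {
        "name": "BiGRU",
    },
}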
Example #7
class PortugueseTextualProcessing:
    NLP = spacy.load('pt_core_news_md')
    STOPWORDS = set(nltk.corpus.stopwords.words('portuguese'))
    CUSTOM_STOPWORDS = FileUtil().get_words_from_file(
        './resources/custom_stopwords.txt')
    TAGGER = load(open('./util/pttag-mm.pkl', 'rb'))
    EMBEDDING_DIM = 100
    MAX_NUM_WORDS = 20000
    LONG_SENTENCE_SIZE = 12
    SHORT_SENTENCE_SIZE = 6
    PT_DICT = pyphen.Pyphen(lang='pt_BR')
    SILVA_SYLLABLE_SEPARATOR = Silva2011SyllableSeparator()
    NER_PT_TAGGER = FileUtil().load_ner_pickle()

    NER_TIME_TAGGER = FileUtil().load_ner_pickle(
        './resources/cat-entropy_cutoff_0.08.pickle')
    LOGICAL_OPERATORS = [
        'e', 'nada', 'a menos que', 'ou', 'nunca', 'sem que', 'não', 'jamais',
        'nem', 'caso', 'se', 'nenhum', 'nenhuma', 'então é porque',
        'desde que', 'contanto que', 'uma vez que', 'fosse'
    ]
    CONTENT_TAGS = ['N', 'ADJ', 'ADV', 'V']
    FUNCTIONAL_TAGS = ['ART', 'PREP', 'PRON', 'K']
    DEFAULT_NOT_FOUND_TAG = 'notfound'
    CASE_SENSITIVE_PATTERN = '[A-Z][a-z]*'
    NUMBER_ONLY_PATTERN = '[0-9]'
    RICH_TAG_TYPES = [
        'Gender', 'Number', 'Person', 'PronType', 'VerbForm', 'Tense'
    ]

    def __init__(self):
        pass

    @staticmethod
    def tokenize(text):
        tokens = tokenize.word_tokenize(text, language='portuguese')
        slash_tokens = [i for i in tokens if '/' in i]
        if slash_tokens:
            PortugueseTextualProcessing().separate_slash_tokens(
                tokens, slash_tokens)
        return tokens

    @staticmethod
    def count_lemmas(text):
        doc = PortugueseTextualProcessing.NLP(text)
        return len([token for token in doc if token.text != token.lemma_])

    @staticmethod
    def get_rich_tags(text):
        doc = PortugueseTextualProcessing.NLP(text)
        tags = [(token.lemma_, token.pos_, token.tag_) for token in doc]
        tagged_text = ''.join([tag[2] for tag in tags
                               if "|" in tag[2]]).split("|")
        return PortugueseTextualProcessing().extract_tags(tagged_text)

    @staticmethod
    def extract_tags(tagged_text):
        tags = []
        for tag in PortugueseTextualProcessing().RICH_TAG_TYPES:
            if tag != 'Person':
                type_tags = PortugueseTextualProcessing().get_regular_tags(
                    tag,
                    PortugueseTextualProcessing().CASE_SENSITIVE_PATTERN,
                    tagged_text)
            else:
                type_tags = PortugueseTextualProcessing().get_regular_tags(
                    tag,
                    PortugueseTextualProcessing().NUMBER_ONLY_PATTERN,
                    tagged_text)
            tags.append(
                RichTagFrequency(
                    tag, ''.join(type_tags).replace(tag + '=',
                                                    ' ').split(' ')[1:]))

        return tags

    @staticmethod
    def get_regular_tags(pattern, case, tagged_text):
        return re.findall(pattern + '=' + case, ''.join(tagged_text))
Example #8
 def test_zip_dir(self):
     FileUtil.zip_dir("c:/tools", "c:/tools.zip")