Esempio n. 1
0
 def save(self, session, dir_path):
     """Persist the model through ``self.saver`` as ``<dir_path>/best_model``.

     The target directory is created first when it does not exist yet
     (a single level only, via ``os.mkdir``).

     :param session: session object handed to ``self.saver.save``
     :param dir_path: directory that receives the saved model
     """
     import os

     dir_missing = not os.path.isdir(dir_path)
     if dir_missing:
         os.mkdir(dir_path)

     model_path = "/".join((dir_path, "best_model"))
     self.saver.save(session, model_path)
     LogInfo.logs("Model saved into %s.", model_path)
Esempio n. 2
0
    def eval_avg(self, setting=1):
        """
        Evaluate items 500..999 using averaged word representations.

        For every item the question and both options are embedded with
        ``get_repr``; the option with the strictly higher ``get_similarity``
        score against the question is predicted (option 2 is predicted on
        ties).  The accuracy is only written to the log.
        :param setting: representation setting forwarded to ``get_repr``
        :return: None
        """
        LogInfo.begin_track(
            "Eval on Copa using average word representations using setting %d...",
            setting)
        first, last = 500, 1000
        total = last - first
        hits = 0
        for idx in range(first, last):
            ask4 = self.copa_ground[idx][0]
            question, cand1, cand2 = self.copa_data[idx]
            q_vec = self.get_repr(question, ask4, setting, 'q')
            c1_vec = self.get_repr(cand1, ask4, setting, 'o')
            c2_vec = self.get_repr(cand2, ask4, setting, 'o')
            sim1 = self.get_similarity(q_vec, c1_vec)
            sim2 = self.get_similarity(q_vec, c2_vec)
            gold = self.copa_ground[idx][1]
            # Option 1 wins only on a strictly larger score.
            guess = 1 if sim1 > sim2 else 2
            if gold == guess:
                hits += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(hits) / total, hits, total)
        LogInfo.end_track()
Esempio n. 3
0
def load_kkv_table(file_path):
    """Load a whitespace-separated ``key1 key2 value`` table from a UTF-8 file.

    The first two tokens of a line, joined by a single space, form the key;
    the third token is the value.  Tokens after the third are ignored and a
    later line overwrites an earlier one with the same key.

    Lines with fewer than three tokens are logged and skipped.  (Previously
    they were logged and then still indexed, which raised ``IndexError``.)

    :param file_path: path of the table file
    :return: dict mapping ``"key1 key2"`` to the value token
    """
    kkv_table = dict()
    with codecs.open(file_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            spt = line.strip().split()
            if len(spt) < 3:
                LogInfo.logs("[error] bad line: %s", line.strip())
                continue
            first_key, second_key, value = spt[:3]
            kkv_table[first_key + ' ' + second_key] = value
    return kkv_table
Esempio n. 4
0
    def create_batches(self):
        """Compute ``self.num_batches`` from ``self.data_size`` and ``self.batch_size``.

        The count is the ceiling of ``data_size / batch_size``, i.e. a trailing
        partial batch is counted as one batch.

        :raises AssertionError: when the resulting number of batches is 0
        """
        # Pure integer arithmetic: no detour through float division.
        full_batches, remainder = divmod(self.data_size, self.batch_size)
        self.num_batches = int(full_batches) + (1 if remainder else 0)

        # When the data (tensor) is too small, give a better error message.
        # Raised explicitly instead of ``assert False`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if self.num_batches == 0:
            raise AssertionError(
                "Not enough data. Make seq_length and batch_size small.")
        LogInfo.logs("Batches created. (%d)", self.num_batches)
Esempio n. 5
0
 def add_pinlei_tag_yyh(self):
     """Wrap known pinlei phrases of the yyh corpus in ``[[...]]`` markers.

     Reads ``<root_fp>/yyh_w2v_train.txt`` and writes
     ``<root_fp>/yyh_w2v_train.txt.pinlei_tag``.  Every line is scanned left
     to right; at each position the longest window of 4, 3, 2 or 1
     consecutive tokens whose concatenation (no separator) is contained in
     ``self.pinlei_set`` is emitted as one ``[[phrase]]`` token.  Tokens that
     start no such window are copied unchanged.  Every emitted token is
     followed by a single space.

     Fixes over the previous version: the 4-token window used ``spt[3]``
     instead of ``spt[i + 3]``, so it was built from the wrong token for
     every position but the first; both files are now closed even when
     processing fails midway.
     """
     LogInfo.begin_track("Begin adding tags for pinleis...")
     src_fp = self.root_fp + "/yyh_w2v_train.txt"
     dst_fp = self.root_fp + "/yyh_w2v_train.txt.pinlei_tag"
     cnt = 0
     with codecs.open(src_fp, 'r', encoding='utf-8') as fin, \
             codecs.open(dst_fp, 'w', encoding='utf-8') as fout:
         for line in fin:
             spt = line.strip().split()
             pieces = []
             i = 0
             while i < len(spt):
                 # Longest match first: 4-, 3-, 2-, then 1-token windows.
                 for width in (4, 3, 2, 1):
                     if i + width > len(spt):
                         continue
                     window = spt[i:i + width]
                     phrase = "".join(window)
                     if phrase not in self.pinlei_set:
                         continue
                     # Only 4-term hits are reported in the log.
                     if width == 4:
                         LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                      *window)
                     pieces.append("[[" + phrase + "]] ")
                     i += width
                     break
                 else:
                     # No window starting here is a pinlei: copy the token.
                     pieces.append(spt[i] + " ")
                     i += 1
             new_line = "".join(pieces)
             fout.write(new_line + "\n")
             cnt += 1
             # Show the first few results as a sanity check.
             if cnt < 5:
                 LogInfo.logs("res ==> (%s)", new_line)
             LogInfo.show_line(cnt, 100000)
     LogInfo.end_track("Pinlei tags added.")
Esempio n. 6
0
    def eval_pair(self, setting=1, strategy=1):
        """
        Evaluate items 0..1870 by summing word-pair similarities.

        For each option, every (question word, option word) pair whose words
        are both present in the respective vector maps contributes its
        ``get_similarity`` value.  Option 1 is predicted only when its score
        is strictly higher; the accuracy is only written to the log.
        :param setting: forwarded to ``get_vec_map``
        :param strategy: 1: raw sum, 2: divide by len(question)+len(option),
                         3: divide by len(question)*len(option)
        :return: None
        """
        LogInfo.begin_track(
            "Eval on ROC using word pairs using setting %d and strategy %d...",
            setting, strategy)
        total = 1871
        hits = 0
        for idx in range(total):
            question, cand1, cand2 = self.copa_data[idx]
            ask4 = self.copa_ground[idx][0]
            q_vecs = self.get_vec_map(ask4=ask4, setting=setting, role='q')
            o_vecs = self.get_vec_map(ask4=ask4, setting=setting, role='o')

            # Same accumulation for both options, option 1 first.
            sums = []
            for cand in (cand1, cand2):
                acc = 0.0
                for q_word in question:
                    for o_word in cand:
                        if q_word in q_vecs and o_word in o_vecs:
                            acc += self.get_similarity(q_vecs[q_word],
                                                       o_vecs[o_word])
                sums.append(acc)
            sim1, sim2 = sums

            if strategy == 2:
                sim1 /= (len(question) + len(cand1))
                sim2 /= (len(question) + len(cand2))
            elif strategy == 3:
                sim1 /= (len(question) * len(cand1))
                sim2 /= (len(question) * len(cand2))

            gold = self.copa_ground[idx][1]
            guess = 1 if sim1 > sim2 else 2
            if gold == guess:
                hits += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(hits) / total, hits, total)
        LogInfo.end_track()
Esempio n. 7
0
 def load(self, data_file, encoding):
     """Load the index data into ``self.data`` and set ``self.data_size``.

     When ``data_file`` exists it is read as a cache: one record per line,
     six tab-separated fields (query idxs, query len, labels, intent, link
     mask, entity idxs), list fields being space-separated integers.
     Otherwise ``data_file + ".name"`` is decoded line by line with
     ``self.decode_line`` and the records are written to ``data_file`` in
     that same format.

     Each element of ``self.data`` is the 6-tuple
     ``(query_idx, query_len, label, intent, link_mask, entity_idx)``.

     Fixes over the previous version: the closing log of the decode branch
     read ``self.max`` while the cache branch reads ``self.max_seq_len``;
     the cache writer is now closed even if decoding fails; the source file
     is opened before the cache file so that a missing source no longer
     leaves an empty cache behind.

     :param data_file: path of the cached index file
     :param encoding: text encoding used for both files
     """

     def join_ints(values):
         # Space-separated rendering used for the list fields of the cache.
         return " ".join([str(x) for x in values])

     def parse_ints(field):
         return [int(idx) for idx in field.split(" ")]

     LogInfo.begin_track("Loading data from %s...", data_file)
     records = []
     cnt = 0
     if os.path.isfile(data_file):
         LogInfo.begin_track("[Exist] Loading from %s...", data_file)
         with codecs.open(data_file, 'r', encoding=encoding) as fin:
             for line in fin:
                 spt = line.strip().split("\t")
                 records.append((parse_ints(spt[0]),
                                 int(spt[1]),
                                 parse_ints(spt[2]),
                                 int(spt[3]),
                                 parse_ints(spt[4]),
                                 parse_ints(spt[5])))
                 cnt += 1
                 LogInfo.show_line(cnt, 1000000)
         LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
     else:
         txt_data_file = data_file + ".name"
         LogInfo.begin_track("[Not Exist] Loading from %s...",
                             txt_data_file)
         with codecs.open(txt_data_file, 'r', encoding=encoding) as fin, \
                 codecs.open(data_file, 'w', encoding=encoding) as fout:
             for line in fin:
                 query_idx, query_len, label, intent, link_mask, entity_idx\
                     = self.decode_line(line)
                 fout.write("\t".join([join_ints(query_idx),
                                       str(query_len),
                                       join_ints(label),
                                       str(intent),
                                       join_ints(link_mask),
                                       join_ints(entity_idx)]) + "\n")
                 records.append((query_idx, query_len, label, intent,
                                 link_mask, entity_idx))
                 cnt += 1
                 LogInfo.show_line(cnt, 1000000)
         LogInfo.logs("Write into %s.", data_file)
         LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
     self.data = records
     self.data_size = len(self.data)
     LogInfo.end_track("Loaded. Size: %d.", self.data_size)
Esempio n. 8
0
    def tag_pinlei(self, query):
        """
        Mark the pinlei phrases contained in a whitespace-tokenised query.

        The tokens are scanned left to right.  At each position the longest
        window of 5 down to 1 consecutive tokens whose concatenation, wrapped
        as ``[[...]]``, is contained in ``self.pinlei`` is taken as a pinlei.
        :param query: query string, tokens separated by whitespace
        :return: (tagged query with pinleis as ``[[...]]`` tokens,
                  the non-pinlei tokens joined by spaces,
                  list of the distinct ``[[...]]`` labels found)
        """
        LogInfo.logs("Tagging pinlei for your query...")
        # Only hits of three or more terms are reported in the log.
        found_fmt = {
            5: "Found 5-term pinlei [%s|%s|%s|%s|%s]",
            4: "Found 4-term pinlei [%s|%s|%s|%s]",
            3: "Found 3-term pinlei [%s|%s|%s]",
        }
        tokens = query.strip().split()
        tagged = []
        plain = []
        label = set()
        pos = 0
        while pos < len(tokens):
            for width in (5, 4, 3, 2, 1):
                if pos + width > len(tokens):
                    continue
                window = tokens[pos:pos + width]
                candidate = "[[" + "".join(window) + "]]"
                if candidate not in self.pinlei:
                    continue
                if width in found_fmt:
                    LogInfo.logs(found_fmt[width], *window)
                label.add(candidate)
                tagged.append(candidate)
                pos += width
                break
            else:
                # Ordinary token: it belongs to the context as well.
                plain.append(tokens[pos])
                tagged.append(tokens[pos])
                pos += 1

        return " ".join(tagged), " ".join(plain), list(label)
Esempio n. 9
0
    def eval_avg_lambda(self, lamb=1.0):
        """
        Evaluate items 500..999 by mixing the scores of settings 1 and 2.

        For both representation settings the question/option similarities
        are computed from ``get_repr`` vectors; the final score of an option
        is ``lamb * score(setting 1) + (1 - lamb) * score(setting 2)``.
        Option 1 is predicted only when its score is strictly higher; the
        accuracy is only written to the log.
        :param lamb: weight of the setting-1 score
        :return: None
        """
        LogInfo.begin_track(
            "Eval on Copa using average word representations using lambda %.2f...",
            lamb)
        first, last = 500, 1000
        total = last - first
        hits = 0
        for idx in range(first, last):
            ask4 = self.copa_ground[idx][0]
            question, cand1, cand2 = self.copa_data[idx]

            # (score for option 1, score for option 2) per setting.
            per_setting = []
            for setting in (1, 2):
                q_vec = self.get_repr(question, ask4, setting, 'q')
                c1_vec = self.get_repr(cand1, ask4, setting, 'o')
                c2_vec = self.get_repr(cand2, ask4, setting, 'o')
                per_setting.append((self.get_similarity(q_vec, c1_vec),
                                    self.get_similarity(q_vec, c2_vec)))
            (score1a, score2a), (score1b, score2b) = per_setting

            score1 = (score1a * lamb) + (score1b * (1 - lamb))
            score2 = (score2a * lamb) + (score2b * (1 - lamb))

            gold = self.copa_ground[idx][1]
            guess = 1 if score1 > score2 else 2
            if gold == guess:
                hits += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(hits) / total, hits, total)
        LogInfo.end_track()
Esempio n. 10
0
def main():
    """Command-line entry point for the word-pair experiments.

    ``sys.argv[1]`` selects the mode:

    * ``full`` - grid over ratio (0.0 .. 2.0) and lambda (0.0 .. 1.0) with
      setting 4 and normalisation on; prints ``ratio lambda accuracy``
      per combination.
    * ``case`` - one verbose run; ``sys.argv[2]`` is lambda (float),
      ``sys.argv[3]`` the setting (int) and the optional ``sys.argv[4]`` the
      ratio (float, default 0.0).

    Fixes over the previous version: the ``case`` call passed only 11
    positional arguments, so ``verbose`` ended up in the ``ratio`` slot of
    ``word_word1`` and tracing stayed switched off; the result line is now
    printed in a form that is valid under both Python 2 and Python 3.
    """
    import sys

    # Read the mode first so a missing argument fails before the data load.
    mode = sys.argv[1]

    copa, worddic = readcopa()
    label = readlabel()
    # NOTE(review): the variable names and the file names below do not line
    # up (e.g. ``cnegdic`` <- "syne_...", ``edic`` <- "syncneg_...").  Kept
    # as found - confirm that this pairing is intended.
    cdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt")
    enegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt")
    cnegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt")
    edic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt")

    verbose = False

    if mode == 'full':
        for ratio_step in range(21):
            for lamd in range(11):
                acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
                                 lamd*0.1, 500, 4, True, ratio_step*0.1, verbose)
                # Single pre-formatted string: same output on Python 2 and 3.
                print("%s %s %s" % (ratio_step*0.1, lamd*0.1, acc))
    elif mode == 'case':
        para1 = float(sys.argv[2])
        para2 = int(sys.argv[3])
        # NOTE(review): the default of 0.0 is an assumption; it makes the
        # ``< ratio`` filter in cal_score never fire.
        ratio = float(sys.argv[4]) if len(sys.argv) > 4 else 0.0
        LogInfo.begin_track("case tracing for word-pair & lambda=%.1f, setting=%d:", para1, para2)
        verbose = True
        acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
                         para1, 500, para2, True, ratio, verbose)
        LogInfo.logs("[Accuracy] %.4f", acc)
        LogInfo.end_track()
Esempio n. 11
0
    def prepare_model_data(self, pinlei_num):
        """Generate the multi-pinlei evaluation data files.

        Reads ``<root_fp>/query_label.txt.<pinlei_num>`` (tab-separated:
        field 0 is written to the check file, field 1 is the context, fields
        2 .. 2+pinlei_num-1 are pinlei names).  A line is kept only when all
        of its pinleis, wrapped as ``[[name]]``, are in ``self.pinlei`` and
        its context has between 6 and 15 space-separated terms.  For every
        pinlei of a kept line, ``context<TAB>[[pinlei]]`` goes to
        ``model_data_test.<pinlei_num>.name`` and field 0 goes to
        ``model_data_test.<pinlei_num>.check``.

        The three files are handled by a ``with`` block so that they are
        closed even when a line fails to process (previously they leaked).

        :param pinlei_num: number of pinleis per line; stored on ``self``
        """
        self.pinlei_num = pinlei_num
        LogInfo.begin_track("Generate Multi-Pinlei Data for evaluation...")
        suffix = str(self.pinlei_num)
        in_fp = self.root_fp + "/query_label.txt." + suffix
        out_fp = self.root_fp + "/model_data_test." + suffix + ".name"
        check_fp = self.root_fp + "/model_data_test." + suffix + ".check"
        not_cover = set()
        with codecs.open(in_fp, 'r', encoding='utf-8') as fin, \
                codecs.open(out_fp, 'w', encoding='utf-8') as fout, \
                codecs.open(check_fp, 'w', encoding='utf-8') as fsho:
            for cnt, line in enumerate(fin, 1):
                if cnt % 100000 == 0:
                    LogInfo.logs("%d lines processed.", cnt)
                    fout.flush()
                spt = line.strip().split("\t")
                context = spt[1]
                # Indexing (not slicing) on purpose: a line with too few
                # fields still raises IndexError instead of passing silently.
                pinleis = ["[[" + spt[i] + "]]"
                           for i in range(2, 2 + self.pinlei_num)]
                # Record every uncovered pinlei of the line, not just the first.
                missing = [p for p in pinleis if p not in self.pinlei]
                if missing:
                    not_cover.update(missing)
                    continue
                term_num = len(context.split(" "))
                if term_num < 6 or term_num > 15:
                    continue
                for pinlei in pinleis:
                    fout.write(context + "\t" + pinlei + "\n")
                    fsho.write(spt[0] + "\n")

        LogInfo.end_track("%d pinlei not cover.", len(not_cover))
Esempio n. 12
0
def load_configs(fp):
    """Parse a tab-separated ``key<TAB>value<TAB>type`` config file.

    Blank lines, lines starting with ``#`` and lines without a tab are
    ignored; lines with fewer than three fields are reported and skipped.
    The type field selects the conversion:

    * ``d`` / ``int``: ``int(value)``
    * ``f`` / ``float`` / ``double``: ``float(value)``
    * ``b`` / ``bool``: True for ``true``, ``True``, ``TRUE`` or ``1``
    * ``tf`` / ``tensorflow``: ``relu`` / ``sigmoid`` / ``tanh`` map to the
      ``tf.nn`` function; any other value stores nothing for the key
    * type ``None`` or value ``None``: ``None``
    * anything else: the raw value string

    :param fp: path of the config file
    :return: dict of parsed settings
    """
    LogInfo.begin_track('Loading config from %s: ', fp)
    int_types = ("d", "int")
    float_types = ("f", "float", "double")
    bool_types = ("b", "bool")
    tf_types = ("tf", "tensorflow")
    truthy = ("true", "True", "TRUE", "1")

    config_dict = {}
    with open(fp, 'r') as br:
        for raw in br:
            line = raw.strip()
            if line == '' or line.startswith('#') or '\t' not in line:
                continue
            spt = line.split('\t')
            if len(spt) < 3:
                LogInfo.logs("[%s] is invalid, pls add type!", line)
                continue
            k, v_str, t = spt[0], spt[1], spt[2]
            if t in int_types:
                config_dict[k] = int(v_str)
            elif t in float_types:
                config_dict[k] = float(v_str)
            elif t in bool_types:
                config_dict[k] = v_str in truthy
            elif t in tf_types:
                # Unknown activation names leave the key untouched.
                if v_str == 'relu':
                    config_dict[k] = tf.nn.relu
                elif v_str == 'sigmoid':
                    config_dict[k] = tf.nn.sigmoid
                elif v_str == 'tanh':
                    config_dict[k] = tf.nn.tanh
            elif t == "None" or v_str == "None":
                config_dict[k] = None
            else:
                config_dict[k] = v_str
            LogInfo.logs('%s = %s', k, v_str)

    LogInfo.end_track()
    return config_dict
Esempio n. 13
0
def cal_score(cause, effect, cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio, verbose=False):
    """Score how strongly the words of ``cause`` lead to the words of ``effect``.

    Cause words are kept when present in both ``cdic`` and ``cnegdic``,
    effect words when present in both ``edic`` and ``enegdic``.  For every
    (cause word, effect word) pair of distinct words the pair score is
    ``lamd * score_suf + (1 - lamd) * score_nec`` with both parts computed by
    ``get_similar``.  A pair is dropped when the relative gap between
    ``score_suf`` and the score of the reversed pair is below ``ratio``.

    The sum of the remaining pair scores is normalised according to
    ``setting``:

    * 1: no normalisation
    * 2: divided by ``len(kept cause words) + len(kept effect words)``
    * 3: divided by ``len(kept cause words) * len(kept effect words)``
    * otherwise: divided by the number of pairs that were summed

    Whenever the divisor is 0 the result is 0.0.  Previously only the last
    case was guarded and settings 2 and 3 raised ``ZeroDivisionError`` on
    empty word lists.  The reverse check is likewise guarded against a zero
    similarity, and the verbose trace no longer drops pairs that happen to
    share the same score.

    :param cause: iterable of cause-side words
    :param effect: iterable of effect-side words
    :param cdic, enegdic, cnegdic, edic: word -> vector mappings
    :param lamd: weight of ``score_suf`` in the pair score
    :param setting: normalisation mode, see above
    :param norm: forwarded to ``get_similar``
    :param ratio: threshold of the reverse check
    :param verbose: when true, trace pair scores through ``LogInfo``
    :return: the (normalised) score
    """
    rcause = [word for word in cause if word in cdic and word in cnegdic]
    reffect = [word for word in effect if word in edic and word in enegdic]

    score = 0
    num = 0
    traced = []     # (pair score, formatted line), only used when verbose
    for wordc in rcause:
        for worde in reffect:
            if wordc == worde:
                continue
            score_suf = get_similar(cdic[wordc], enegdic[worde], norm)
            score_nec = get_similar(cnegdic[wordc], enegdic[worde], norm)
            tmp = lamd * score_suf + (1-lamd) * score_nec
            # check reverse
            # NOTE(review): cdic[worde] / enegdic[wordc] are not covered by
            # the membership filters above and raise KeyError if the
            # dictionaries do not share one vocabulary - confirm.
            score_reverse = get_similar(cdic[worde], enegdic[wordc], norm)
            gap = abs(score_suf - score_reverse)
            scale = min(abs(score_suf), abs(score_reverse))
            if scale:
                rel_gap = gap / scale
            else:
                # Zero similarity: an identical pair has no gap, any other
                # pair an unbounded one.
                rel_gap = float('inf') if gap else 0.0
            if rel_gap < ratio:
                continue
            score += tmp
            num += 1
            traced.append(
                (tmp,
                 "[%s]-[%s] ==> %.1f*[%.2f]+%.1f*[%.2f]=[%.4f]" %
                 (wordc, worde, lamd, score_suf, 1-lamd, score_nec, tmp)))

    if verbose:
        for _, line in sorted(traced, key=lambda item: item[0], reverse=True):
            LogInfo.logs(line)

    if setting == 1:
        return score

    if setting == 2:
        total_len = len(rcause) + len(reffect)
        result = score / total_len if total_len else 0.0
        if verbose:
            LogInfo.logs("%.4f / (%d+%d=%d) = %.4f", score,
                         len(rcause), len(reffect), total_len, result)
        return result

    if setting == 3:
        pair_space = len(rcause) * len(reffect)
        return score / pair_space if pair_space else 0.0

    result = score / num if num else 0.0
    if verbose:
        LogInfo.logs("%.4f / %d = %.4f", score, num, result)
    return result
Esempio n. 14
0
def fuzzy_match_name(mention, vocab, PN):
    """
    Rank vocabulary names against a mention by Jaccard score.

    The mention is reduced to the set of its elements and compared with the
    set stored for every vocabulary name; names scoring exactly 1.0 are left
    out of the ranking.
    :param mention: iterable whose elements form the mention set
    :param vocab: mapping name -> set (iterated through ``.items()``)
    :param PN: the ranked list is created with capacity PN-1
    :return: ``top_names()`` of the ranked list
    """
    m_set = set(mention)
    rank_list = TopKRankedList(PN - 1)
    for name, c_set in vocab.items():
        score = get_jaccard_score(m_set, c_set)
        # A perfect score is not offered as a candidate.
        if not (score == 1.0):
            rank_list.push((name, score))
    LogInfo.logs("Cands for %s: [%s]", mention,
                 "|".join(rank_list.top_names()))
    return rank_list.top_names()
Esempio n. 15
0
    def prepare_model_data(self):
        """Generate the training data file with sampled negatives.

        Reads ``<root_fp>/query_label.txt.1`` (tab-separated; field 1 is the
        context, field 2 the pinlei name) and writes
        ``<root_fp>/model_data_train.name``.  A line is skipped when its
        pinlei, wrapped as ``[[name]]``, is not in ``self.pinlei`` or when
        its context does not have between 6 and 15 space-separated terms.
        For every kept line the positive ``context<TAB>[[pinlei]]`` is
        written first, followed by one line per negative returned by
        ``self.neg_sample_random(pinlei, 19)``.

        Both files are handled by a ``with`` block so that they are closed
        even when a line fails to process (previously they leaked).
        """
        LogInfo.begin_track("Generate model data...")
        not_cover = 0
        not_context = 0
        cnt = 0
        # .1 means single pinlei
        in_fp = self.root_fp + "/query_label.txt.1"
        out_fp = self.root_fp + "/model_data_train.name"
        with codecs.open(in_fp, 'r', encoding='utf-8') as fin, \
                codecs.open(out_fp, 'w', encoding='utf-8') as fout:
            for line in fin:
                cnt += 1
                if cnt % 100000 == 0:
                    LogInfo.logs("%d lines processed.", cnt)
                    fout.flush()
                spt = line.strip().split("\t")
                context = spt[1]
                pinlei = "[[" + spt[2] + "]]"
                if pinlei not in self.pinlei:
                    not_cover += 1
                    continue
                term_num = len(context.split(" "))
                if term_num < 6 or term_num > 15:
                    not_context += 1
                    continue
                fout.write(context + "\t" + pinlei + "\n")
                for neg in self.neg_sample_random(pinlei, 19):
                    fout.write(context + "\t" + neg + "\n")

        LogInfo.end_track("Model data prepared. Size: %d. (%d, %d).",
                          cnt - not_context - not_cover, not_cover,
                          not_context)
Esempio n. 16
0
    def _build_graph(self):
        """Build the TensorFlow graph: embedding lookup, RNN encoder,
        per-step label projection and, in TRAIN mode, the CRF loss and the
        train op.

        Reads its hyper-parameters from ``self.config`` and the initial
        embedding table from ``self.embedding_vocab``.  Everything is stored
        on ``self`` (placeholders, ``logits``, ``transition_mat`` and, in
        TRAIN mode only, ``log_likelihood``, ``loss``, ``global_step`` and
        ``train_op``).
        """
        # Placeholders: token ids [batch, max_seq_len], lengths [batch],
        # gold label ids [batch, max_seq_len].
        self.query_idx = tf.placeholder(dtype=tf.int32,
                                        shape=[None, self.config.get("max_seq_len")])
        self.query_len = tf.placeholder(dtype=tf.int32,
                                        shape=[None, ])
        self.label = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.config.get("max_seq_len")])

        self.batch_size = self.config.get("batch_size")

        # The embedding table and its lookup are pinned to the CPU.
        with tf.device('/cpu:0'), tf.name_scope("embedding_layer"):
            # Table of shape [vocab_size, embedding_dim], initialised from
            # self.embedding_vocab.
            term_embedding = tf.get_variable(
                name="embedding",
                shape=[self.config.get("vocab_size"), self.config.get("embedding_dim")],
                dtype=tf.float32,
                initializer=tf.constant_initializer(self.embedding_vocab)
            )
            self.query_embedding = tf.nn.embedding_lookup(term_embedding, self.query_idx)
            # tf.split:    Tensor -> list tensors
            # tf.stack:    list of tensors -> one tensor
            # One tensor per time step: split along axis 1 into max_seq_len
            # pieces and squeeze the split axis away.
            self.query_slice = [
                tf.squeeze(_input, [1])
                for _input in tf.split(self.query_embedding,
                                       self.config.get("max_seq_len"),
                                       axis=1)
            ]
            # better style: use unstack!  one tensor -> list of tensors
            # equal to the above one
            # self.query_slice = tf.unstack(self.query_embedding, axis=1)

        # bi-LSTM
        with tf.name_scope("rnn_encoder"):
            # Hand only the encoder-related keys of the config to the encoder.
            rnn_config = dict()
            key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
                        "dropout_output_keep_prob", "num_layers"]
            for key in key_list:
                rnn_config[key] = self.config.get(key)
            rnn_encoder = BidirectionalRNNEncoder(rnn_config, self.mode)
            self.biLstm = rnn_encoder.encode(self.query_slice, self.query_len)

        # output dim = 2 * rnn cell dim (fw + bw)
        self.hidden_dim = self.config.get("num_units") * 2
        # Element-wise clipping of the encoder's attention_values into
        # [-grad_clip, grad_clip].  Despite the config key's name, the
        # clipping is applied to these values here, not to gradients.
        self.biLstm_clip = tf.clip_by_value(self.biLstm.attention_values,
                                            -self.config.get("grad_clip"),
                                            self.config.get("grad_clip"))
        # training parameters
        with tf.name_scope("parameters"):
            # Projection hidden_dim -> label_num: Xavier-initialised weights,
            # zero-initialised bias.
            self.W_l = tf.get_variable(name="W_l",
                                       shape=[self.hidden_dim,
                                              self.config.get("label_num")],
                                       dtype=tf.float32,
                                       initializer
                                       =tf.contrib.layers.xavier_initializer(uniform=True))
            self.b_l = tf.get_variable(name="b_l",
                                       shape=[self.config.get("label_num")],
                                       dtype=tf.float32,
                                       initializer=tf.constant_initializer(0.0))

        # above bi-LSTM
        # Flatten to [-1, hidden_dim] so that one matmul projects every
        # position at once.
        self.outputs = tf.reshape(tensor=self.biLstm_clip,
                                  shape=[-1, self.hidden_dim])
        self.label_matrix = tf.nn.xw_plus_b(self.outputs, self.W_l, self.b_l)
        # [B, T, label_num]
        self.logits = tf.reshape(tensor=self.label_matrix,
                                 shape=[-1, self.config.get("max_seq_len"),
                                        self.config.get("label_num")])
        # [label_num + 1, label_num + 1]: one tag more than label_num; the
        # extra index (label_num) is the tag given to the start step that is
        # prepended in the TRAIN branch below.
        self.transition_mat = tf.get_variable(
            "transitions",
            shape=[self.config.get("label_num")+1, self.config.get("label_num")+1],
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))

        # ===================================== Loss ====================================== #
        # The loss and the train op exist in TRAIN mode only.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:

            # # softmax sequence loss for sequence nlu
            # self.loss = softmax_sequence_loss(logits=self.logits,
            #                                   targets=self.label,
            #                                   sequence_length=self.query_len)
            # self.loss = tf.reduce_mean(self.loss)

            # padding logits for crf loss, length += 1
            # "small" serves as a strongly negative logit.
            # NOTE(review): the tensors below are built with the fixed
            # self.batch_size, so presumably every training batch must have
            # exactly batch_size rows - confirm against the feeding code.
            small = -1000.0
            # Start step [batch_size, 1, label_num + 1]: "small" for the
            # label_num real tags, 0 for the extra tag.
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.config.get("label_num")]),
                 tf.zeros(shape=[self.batch_size, 1, 1])],
                axis=-1
            )
            LogInfo.logs(start_logits.get_shape().as_list())
            # Extra column [batch_size, max_seq_len, 1] of "small": the logit
            # of the extra tag at every real position.
            pad_logits = tf.cast(small * tf.ones([self.batch_size,
                                                  self.config.get("max_seq_len"), 1]), tf.float32)
            LogInfo.logs(pad_logits.get_shape().as_list())
            # Widen the tag axis by the extra tag, then prepend the start
            # step along the time axis.
            self.logits = tf.concat([self.logits, pad_logits], axis=-1)
            self.logits = tf.concat([start_logits, self.logits], axis=1)
            LogInfo.logs(self.logits.get_shape().as_list())
            # Gold tags get a matching leading column filled with the extra
            # tag's index (label_num).
            targets = tf.concat(
                [tf.cast(self.config.get("label_num")*tf.ones([self.batch_size, 1]),
                         tf.int32),
                 self.label], axis=-1
            )
            LogInfo.logs(targets.get_shape().as_list())

            # CRF layer
            # The lengths are query_len + 1 because of the prepended start
            # step.  self.transition_mat is rebound to the matrix returned by
            # crf_log_likelihood.
            self.log_likelihood, self.transition_mat = \
                tf.contrib.crf.crf_log_likelihood(
                    inputs=self.logits,
                    tag_indices=targets,
                    transition_params=self.transition_mat,
                    sequence_lengths=self.query_len+1)
            # Loss: mean negative log-likelihood.
            self.loss = tf.reduce_mean(-self.log_likelihood)

            # train op
            self.global_step = tf.Variable(0, name="global_step",  trainable=False)
            optimizer = get_optimizer(self.config.get("optimizer"), self.config.get("lr"))
            grads_and_vars = optimizer.compute_gradients(self.loss)
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
0
def word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd, num, setting, norm, ratio, verbose=False):
    """
    Score COPA items num..999 with cal_score and return the accuracy.

    For every item both alternatives are scored against the premise; the
    alternative with the strictly higher score is the prediction (a tie is
    counted as wrong).

    :param copa: indexable of (hyp, alt1, alt2) triples
    :param label: indexable of (ask, labl) pairs; ask is 0 (ask for cause)
                  or 1 (ask for effect), labl is the correct option (1 or 2)
    :param cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio:
                  forwarded unchanged to cal_score
    :param num: index of the first item to evaluate (items num..999 are used)
    :param verbose: when True, trace every step through LogInfo
    :return: acc / (1000 - num); 0.0 when that range is empty
    """
    total = 1000 - num
    acc = 0
    wrong = 0

    def score_pair(cause, effect, title):
        # One cal_score call wrapped in its own (optional) log track.
        if verbose:
            LogInfo.begin_track(title, cause, effect)
        score = cal_score(cause, effect, cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio, verbose)
        if verbose:
            LogInfo.logs("final score: %.4f", score)
            LogInfo.end_track()
        return score

    for i in range(num, 1000):
        hyp, alt1, alt2 = copa[i]
        ask, labl = label[i]
        if verbose:
            LogInfo.begin_track("step into copa #%d", i+1)
            LogInfo.logs("q: %s", hyp)
            LogInfo.logs("o1: %s", alt1)
            LogInfo.logs("o2: %s", alt2)
            LogInfo.logs("answer: o%d", labl)

        if ask == 0 or ask == 1:
            if ask == 0:
                # ask for cause: each option is the candidate cause of the premise
                header = "[ask for cause] o1/o2 -> q"
                pairs = ((alt1, hyp, "o1->q: [%s]->[%s]"),
                         (alt2, hyp, "o2->q: [%s]->[%s]"))
                ok_msg, bad_msg = "[[correct]]", "[[wrong]]"
            else:
                # ask for effect: the premise is the cause of each option
                header = "[ask for effect] q -> o1/o2"
                pairs = ((hyp, alt1, "q->o1: [%s]->[%s]"),
                         (hyp, alt2, "q->o2: [%s]->[%s]"))
                ok_msg, bad_msg = ">>correct<<", ">>wrong<<"

            if verbose:
                LogInfo.begin_track(header)
            score1 = score_pair(*pairs[0])
            score2 = score_pair(*pairs[1])

            if (score1 > score2 and labl == 1) or (score1 < score2 and labl == 2):
                acc += 1
                verdict = ok_msg
            else:
                # Both question types now feed the wrong counter; previously
                # only the effect branch did, so the status line was skewed.
                wrong += 1
                verdict = bad_msg
            if verbose:
                LogInfo.logs(verdict)
                LogInfo.end_track()
        else:
            # Unknown question type: report it and count nothing.
            print(ask)
            if verbose:
                LogInfo.logs("[error] ask=%d", ask)

        if verbose:
            LogInfo.end_track("end for #%d", i+1)
            LogInfo.logs("===========")

    if verbose:
        LogInfo.logs("status: %dY-%dW/%d", acc, wrong, total)
    if total <= 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return acc*1.0/total
Esempio n. 18
0
 def load(self, session, fp):
     """
     Restore model state from the checkpoint at `fp` into `session`
     through self.saver, logging before and after the restore.
     """
     LogInfo.logs("Loading Model from %s", fp)
     restore = self.saver.restore
     restore(session, fp)
     LogInfo.logs("Model loaded from %s", fp)
Esempio n. 19
0
    def load_data(self):
        """
        Load word vectors, COPA sentences and COPA labels from fixed paths.

        - self.sync / self.syne_neg and self.sync_neg / self.syne receive
          word -> list-of-floats entries read in lock-step from paired files.
        - self.copa_data receives 1000 [sentence, option1, option2] entries,
          each a list of words whose token starts with NN, JJ or VB.
        - self.copa_ground receives [second field, int(third field)] for every
          tab-separated line of the label file.
        :return: None
        """
        LogInfo.begin_track("Loading data...")

        def load_vector_pair(path_c, path_e, table_c, table_e):
            # Read two "<word> <float> <float> ..." files line by line in
            # lock-step and store each row in its own table.
            with open(path_c) as finc, open(path_e) as fine:
                for cnt, (linec, linee) in enumerate(zip(finc, fine), 1):
                    LogInfo.show_line(cnt, 100000)
                    sptc = linec.strip().split()
                    spte = linee.strip().split()
                    wordc = sptc[0]
                    worde = spte[0]
                    # Real lists (not map objects) so the vectors can be
                    # reused and measured on Python 3 as well.
                    vecc = [float(x) for x in sptc[1:]]
                    vece = [float(x) for x in spte[1:]]
                    table_c[wordc] = vecc
                    table_e[worde] = vece

        # An earlier variant read syn0_w2v_200.txt for both sides of each pair.
        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt",
            "/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt",
            self.sync, self.syne_neg)
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt",
            "/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt",
            self.sync_neg, self.syne)
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # NN, JJ, VB (presumably POS-tag prefixes; tokens look like "TAG:word")
        content_tags = ('NN', 'JJ', 'VB')

        def content_words(raw_line):
            # Keep the part after the first ':' of every token with a wanted tag.
            return [token.split(':')[1]
                    for token in raw_line.strip().split()
                    if token.startswith(content_tags)]

        with open("/home/yuchen/data/copa_lem.txt") as fin:
            # Each item occupies three consecutive lines: premise, option 1, option 2.
            for _ in range(1000):
                raw_sentence = fin.readline()
                raw_option1 = fin.readline()
                raw_option2 = fin.readline()
                self.copa_data.append([content_words(raw_sentence),
                                       content_words(raw_option1),
                                       content_words(raw_option2)])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                spt = line.strip().split('\t')
                self.copa_ground.append([spt[1], int(spt[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
Esempio n. 20
0
 def add(self, key, value):
     """
     Store `value` under `key` in self.config_dict.

     If the key is already present a warning showing the old and the new
     value is logged first; the value is overwritten either way.
     """
     table = self.config_dict
     if key in table:
         previous = str(table.get(key))
         LogInfo.logs("[warning] key already exists [%s: %s], now change to [%s].",
                      key, previous, value)
     table[key] = value
Esempio n. 21
0
    def process_query(self):
        """
        Tag every query in <root_fp>/query.txt and write <root_fp>/query_label.txt.

        Each input line is split on whitespace. At every position the longest
        run of up to 5 consecutive tokens whose concatenation (no separator)
        is in self.pinlei_set is wrapped as "[[...]]" and collected as a
        label; tokens that match nothing are copied through and also
        collected as context.

        Output line: "<tagged>\\t<context>\\t<label>\\t<label>..." when at
        least one label was found, otherwise just "<tagged>".
        :return: None
        """
        LogInfo.begin_track("Begin adding tags for queries...")
        max_ngram = 5
        cnt = 0
        # `with` closes both files even if a line fails half-way through.
        with codecs.open(self.root_fp + "/query.txt", 'r', encoding='utf-8') as fin, \
                codecs.open(self.root_fp + "/query_label.txt", 'w', encoding='utf-8') as fout:
            for line in fin:
                spt = line.strip().split()
                tagged = []
                context = []
                label = set()
                i = 0
                while i < len(spt):
                    matched = 0
                    # Longest candidate first, never running past the line end.
                    for n in range(min(max_ngram, len(spt) - i), 0, -1):
                        window = spt[i:i + n]
                        candidate = "".join(window)
                        if candidate not in self.pinlei_set:
                            continue
                        if n >= 3:
                            # Only matches of 3+ terms are logged.
                            fmt = "Found %d-term pinlei [%s]" % (n, "|".join(["%s"] * n))
                            LogInfo.logs(fmt, *window)
                        label.add(candidate)
                        tagged.append("[[" + candidate + "]]")
                        matched = n
                        break
                    if matched:
                        i += matched
                    else:
                        context.append(spt[i])
                        tagged.append(spt[i])
                        i += 1

                new_line = " ".join(tagged)
                if len(label) != 0:
                    ret = new_line + "\t" + \
                          " ".join(context) + "\t" + \
                          "\t".join(label) + "\n"
                else:
                    ret = new_line + "\n"
                fout.write(ret)
                cnt += 1
                if cnt < 5:
                    # Show the first few results as a sanity check.
                    LogInfo.logs("res ==> (%s)", ret.strip())
                LogInfo.show_line(cnt, 100000)
        LogInfo.end_track("Query processed.")
Esempio n. 22
0
from xusheng.util.config import ConfigDict
from xusheng.util.data_util import VocabularyLoader
from xusheng.util.log_util import LogInfo

# Script entry point: load config, vocabulary and test data, then build the
# evaluation-mode model.
# Usage: <script> <setting_dir> <try_dir>
# NOTE(review): sys, tf, DataLoader and IntentionIdentifier are not imported in
# the visible lines - presumably imported above this excerpt; confirm.
if __name__ == '__main__':
    setting_dir = sys.argv[1]
    try_dir = sys.argv[2]
    # Every run keeps its parameters in runnings/<setting_dir>/<try_dir>/param_config
    root_path = 'runnings/%s/%s' % (setting_dir, try_dir)
    config_path = '%s/param_config' % root_path
    config = ConfigDict(config_path)

    vocab_loader = VocabularyLoader()
    vocab_loader.load_vocab(config.get("vocab_fp"),
                            config.get("embedding_dim"), 'utf-8')
    # vocab_size is only known after the vocabulary is loaded, so it is added
    # to the config here, before the model is constructed from that config.
    config.add("vocab_size", vocab_loader.vocab_size)
    LogInfo.logs("Embedding shape: %s.", vocab_loader.vocab_embedding.shape)

    data_loader = DataLoader(config.get("max_seq_len"),
                             vocab_loader.vocab_index_dict)
    data_loader.load(config.get("test_data_fp"), 'utf-8')

    LogInfo.begin_track("Create models...")
    # Build the model inside its own graph rather than the default one.
    graph = tf.Graph()
    with graph.as_default():
        test_model = IntentionIdentifier(
            config=config,
            mode=tf.contrib.learn.ModeKeys.EVAL,
            embedding_vocab=vocab_loader.vocab_embedding)
        LogInfo.logs("Test model created.")
    LogInfo.end_track()
Esempio n. 23
0
            paths.append(path[1:])
        return paths

    def save(self, session, dir_path):
        """
        Save the session through self.saver to "<dir_path>/best_model".

        The directory is created first when it does not exist, including any
        missing parent directories.
        :param session: session object handed to self.saver.save
        :param dir_path: target directory for the checkpoint
        :return: None
        """
        import os
        if not os.path.isdir(dir_path):
            # makedirs, unlike mkdir, does not fail when parents are missing.
            os.makedirs(dir_path)
        fp = dir_path + "/best_model"
        self.saver.save(session, fp)
        LogInfo.logs("Model saved into %s.", fp)

    def load(self, session, fp):
        """
        Restore the checkpoint stored at `fp` into `session` via self.saver
        and log the start and the end of the operation.
        """
        LogInfo.logs("Loading Model from %s", fp)
        saver = self.saver
        saver.restore(session, fp)
        LogInfo.logs("Model loaded from %s", fp)


# Smoke test: construct the NER model in TRAIN mode from a run's param_config
# and report success if construction does not raise.
# Usage: <script> <setting_dir> <try_dir>
if __name__ == "__main__":
    import sys
    import numpy as np
    from xusheng.util.config import ConfigDict
    config = ConfigDict("runnings/%s/%s/param_config"
                        % (sys.argv[1], sys.argv[2]))
    # A 1x2 dummy array stands in for the embedding matrix here.
    model = NER(config=config,
                mode=tf.contrib.learn.ModeKeys.TRAIN,
                embedding_vocab=np.array([[1, 2]]))
    LogInfo.logs("Model compiled successfully!")


Esempio n. 24
0
    def eval_pair_lambda(self, lamb=1.0, strategy=1):
        """
        Evaluate COPA items 500..999 with word-pair similarities, mixing the
        scores of vector setting 1 and setting 2.

        For each option: score = lamb * S(setting 1) + (1 - lamb) * S(setting 2)
        where S sums self.get_similarity over all (sentence word, option word)
        pairs present in the respective vector maps. Option 1 is predicted
        only when its score is strictly higher; otherwise option 2.

        Accuracy (overall and per question type) is written to the log;
        nothing is returned.
        :param lamb: weight of the setting-1 score
        :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
        :return: None
        """
        LogInfo.begin_track(
            "Eval on Copa using word pairs using lambda %.2f and strategy %d...",
            lamb, strategy)

        def pair_sum(q_map, o_map, left_words, right_words):
            # Sum of similarities over all word pairs known to both maps.
            total = 0.0
            for word1 in left_words:
                for word2 in right_words:
                    if word1 in q_map and word2 in o_map:
                        total += self.get_similarity(q_map[word1],
                                                     o_map[word2])
            return total

        def normalise(score, n_left, n_right):
            # Apply the length normalisation selected by `strategy`.
            if strategy == 2:
                denom = n_left + n_right
            elif strategy == 3:
                denom = n_left * n_right
            else:
                return score
            # A zero denominator means there were no pairs, so score is
            # already 0.0; skip the division instead of crashing.
            return score / denom if denom else score

        def ratio(hit, total):
            # Accuracy that tolerates an empty bucket.
            return float(hit) / total if total else 0.0

        correct = 0
        cause = 0
        effect = 0
        cause_correct = 0
        effect_correct = 0
        for i in range(500, 1000):
            sentence, option1, option2 = self.copa_data[i]
            ask4 = self.copa_ground[i][0]
            is_cause = ask4 == 'cause'
            if is_cause:
                cause += 1
            else:
                effect += 1

            # left: vectors of setting 1
            q_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='o')
            score1a = pair_sum(q_vec_map, o_vec_map, sentence, option1)
            score2a = pair_sum(q_vec_map, o_vec_map, sentence, option2)

            # right: vectors of setting 2
            q_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='o')
            score1b = pair_sum(q_vec_map, o_vec_map, sentence, option1)
            score2b = pair_sum(q_vec_map, o_vec_map, sentence, option2)

            score1 = (score1a * lamb) + (score1b * (1 - lamb))
            score2 = (score2a * lamb) + (score2b * (1 - lamb))
            score1 = normalise(score1, len(sentence), len(option1))
            score2 = normalise(score2, len(sentence), len(option2))

            truth = self.copa_ground[i][1]
            predicted = 1 if score1 > score2 else 2
            if truth == predicted:
                correct += 1
                if is_cause:
                    cause_correct += 1
                else:
                    effect_correct += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(correct) / 500, correct, 500)
        LogInfo.logs("[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                     ratio(cause_correct, cause), cause_correct, cause,
                     ratio(effect_correct, effect), effect_correct, effect)
        LogInfo.end_track()
Esempio n. 25
0
    def eval_pair(self, setting=1, strategy=1):
        """
        Evaluate COPA items 500..999 with word-pair similarities.

        The score of an option is the sum of self.get_similarity over all
        (sentence word, option word) pairs present in the vector maps of the
        given setting, optionally normalised by `strategy`. Option 1 is
        predicted only when its score is strictly higher; otherwise option 2.

        Overall accuracy is written to the log; for setting 3 the per-type
        (cause/effect) accuracy is logged as well. Nothing is returned.
        :param setting: forwarded to self.get_vec_map
        :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
        :return: None
        """
        LogInfo.begin_track(
            "Eval on Copa using word pairs using setting %d and strategy %d...",
            setting, strategy)

        def pair_sum(q_map, o_map, left_words, right_words):
            # Sum of similarities over all word pairs known to both maps.
            total = 0.0
            for word1 in left_words:
                for word2 in right_words:
                    if word1 in q_map and word2 in o_map:
                        total += self.get_similarity(q_map[word1],
                                                     o_map[word2])
            return total

        def normalise(score, n_left, n_right):
            # Apply the length normalisation selected by `strategy`.
            if strategy == 2:
                denom = n_left + n_right
            elif strategy == 3:
                denom = n_left * n_right
            else:
                return score
            # A zero denominator means there were no pairs, so score is
            # already 0.0; skip the division instead of crashing.
            return score / denom if denom else score

        def ratio(hit, total):
            # Accuracy that tolerates an empty bucket.
            return float(hit) / total if total else 0.0

        correct = 0
        cause = 0
        effect = 0
        cause_correct = 0
        effect_correct = 0
        for i in range(500, 1000):
            sentence, option1, option2 = self.copa_data[i]
            ask4 = self.copa_ground[i][0]
            is_cause = ask4 == 'cause'
            if is_cause:
                cause += 1
            else:
                effect += 1
            q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o')

            score1 = pair_sum(q_vec_map, o_vec_map, sentence, option1)
            score2 = pair_sum(q_vec_map, o_vec_map, sentence, option2)
            score1 = normalise(score1, len(sentence), len(option1))
            score2 = normalise(score2, len(sentence), len(option2))

            truth = self.copa_ground[i][1]
            predicted = 1 if score1 > score2 else 2
            if truth == predicted:
                correct += 1
                # The per-type breakdown is only tracked for setting 3.
                if setting == 3:
                    if is_cause:
                        cause_correct += 1
                    else:
                        effect_correct += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(correct) / 500, correct, 500)
        if setting == 3:
            LogInfo.logs(
                "[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                ratio(cause_correct, cause), cause_correct, cause,
                ratio(effect_correct, effect), effect_correct, effect)
        LogInfo.end_track()
Esempio n. 26
0
def eval_seq_crf(y_pred_, y_true_, method='precision'):
    """
    Entity-level evaluation for sequence labeling, without "Outside"
    under specific conditions (3 entity kinds: PL, PK, PV).

    An entity is a *_B tag followed by any number of *_I tags of the same
    kind. Entities are enumerated from one sequence and counted as correct
    when the other sequence carries exactly the same tags over that span and
    does not continue the entity beyond it. For method='recall' the two
    inputs are swapped, i.e. entities are enumerated from y_true_.

    :param y_pred_: [B, T, ] predicted tag indices (into tag_dict)
    :param y_true_: [B, T, ] reference tag indices (into tag_dict)
    :param method: 'precision' or 'recall'
    :return: overall precision or recall over all kinds
             (0.0 when no entity was enumerated)
    :raises ValueError: if method is neither 'precision' nor 'recall'
    """
    # Validate before opening the log track so a bad call leaves no open track.
    if method not in ('precision', 'recall'):
        raise ValueError("method must be 'precision' or 'recall', got %r" % (method,))

    # Index layout matters: every *_I index is its *_B index + 1.
    tag_dict = ['O', 'PL_B', 'PL_I', 'PK_B', 'PK_I', 'PV_B', 'PV_I']
    kinds = ('PL', 'PK', 'PV')
    LogInfo.begin_track("Eval seq %s...", method)
    if method == 'precision':
        y_pred = np.array(y_pred_)
        y_true = np.array(y_true_)
    else:
        # recall == precision with the roles of the two sequences exchanged
        y_pred = np.array(y_true_)
        y_true = np.array(y_pred_)

    correct = dict((kind, 0) for kind in kinds)
    act_cnt = dict((kind, 0) for kind in kinds)
    for line_pred, line_true in zip(y_pred, y_true):
        i = 0
        cnt = len(line_pred)
        while i < cnt:
            tag = tag_dict[line_pred[i]]
            # Only a *_B tag opens an entity; 'O' and stray *_I are skipped.
            if tag == 'O' or tag[3] != 'B':
                i += 1
                continue
            kind = tag[:2]

            # Extend the span over the following *_I tags of the same kind.
            j = i + 1
            while j < cnt:
                next_tag = tag_dict[line_pred[j]]
                if next_tag[:2] == kind and next_tag[3] == 'I':
                    j += 1
                else:
                    break

            act_cnt[kind] += 1

            act_label = ' '.join([str(x) for x in line_true[i:j]])
            proposed_label = ' '.join([str(x) for x in line_pred[i:j]])
            # Same tags over the span AND the other sequence's entity must
            # end here too (its next tag is not the matching *_I).
            if act_label == proposed_label and (
                    j == cnt or line_true[j] != line_true[i] + 1):
                correct[kind] += 1
            i = j

    correct_total = 0
    cnt_total = 0
    for key in kinds:
        if act_cnt[key] == 0:
            score = 0.0
        else:
            score = correct[key] * 1.0 / act_cnt[key]
        LogInfo.logs("%s : %.4f(%d/%d)", key, score, correct[key],
                     act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]

    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total,
                 cnt_total)
    LogInfo.end_track()
    return overall
Esempio n. 27
0
    def load_data(self):
        """
        Load word vectors, COPA sentences and COPA labels from fixed paths.

        - self.sync / self.syne_neg and self.sync_neg / self.syne receive
          word -> list-of-floats entries read in lock-step from paired files;
          a line pair with a non-numeric component is logged and skipped.
        - self.copa_data receives 1000 [sentence, option1, option2] entries,
          each the list of the parts after the first ':' of every token.
        - self.copa_ground receives [second field, int(third field)] for every
          tab-separated line of the label file.
        :return: None
        """
        LogInfo.begin_track("Loading data...")

        def load_vector_pair(path_c, path_e, table_c, table_e):
            # Read two "<word> <float> <float> ..." files line by line in
            # lock-step and store each row in its own table.
            with open(path_c) as finc, open(path_e) as fine:
                for cnt, (linec, linee) in enumerate(zip(finc, fine), 1):
                    LogInfo.show_line(cnt, 100000)
                    sptc = linec.strip().split()
                    spte = linee.strip().split()
                    wordc = sptc[0]
                    worde = spte[0]
                    try:
                        # List comprehensions convert eagerly, so a bad number
                        # raises here (inside the try) on Python 3 as well.
                        vecc = [float(x) for x in sptc[1:]]
                        vece = [float(x) for x in spte[1:]]
                    except ValueError:
                        LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                        continue
                    # Store only when both rows parsed.
                    table_c[wordc] = vecc
                    table_e[worde] = vece

        # An earlier variant read syn0_w2v_200.txt for both sides of each pair.
        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt",
            "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt",
            self.sync, self.syne_neg)
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt",
            "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt",
            self.sync_neg, self.syne)
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        def words_of(raw_line):
            # Tokens look like "<prefix>:<word>"; keep the word part.
            return [token.split(':')[1] for token in raw_line.strip().split()]

        with open("/home/yuchen/data/copa_phr.txt") as fin:
            # Each item occupies three consecutive lines: premise, option 1, option 2.
            for _ in range(1000):
                raw_sentence = fin.readline()
                raw_option1 = fin.readline()
                raw_option2 = fin.readline()
                self.copa_data.append([words_of(raw_sentence),
                                       words_of(raw_option1),
                                       words_of(raw_option2)])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                spt = line.strip().split('\t')
                self.copa_ground.append([spt[1], int(spt[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
Esempio n. 28
0
from xusheng.util.config import ConfigDict
from xusheng.util.data_util import VocabularyLoader
from xusheng.util.eval_util import eval_seq_crf
from xusheng.util.log_util import LogInfo

# Script entry point: load config, vocabulary and data, then split the data
# into train / valid / test parts.
# Usage: <script> <setting_dir> <try_dir>
# NOTE(review): sys and DataLoader are not imported in the visible lines, and
# the excerpt is cut off after the validation split - confirm against the
# full file.
if __name__ == '__main__':
    setting_dir = sys.argv[1]
    try_dir = sys.argv[2]
    # Every run keeps its parameters in runnings/<setting_dir>/<try_dir>/param_config
    root_path = 'runnings/%s/%s' % (setting_dir, try_dir)
    config_path = '%s/param_config' % root_path
    config = ConfigDict(config_path)

    vocab_loader = VocabularyLoader()
    vocab_loader.load_vocab(config.get("vocab_fp"), config.get("embedding_dim"), 'utf-8')
    # vocab_size is only known after the vocabulary is loaded.
    config.add("vocab_size", vocab_loader.vocab_size)
    LogInfo.logs("Embedding shape: %s.", vocab_loader.vocab_embedding.shape)

    data_loader = DataLoader(config.get("max_seq_len"), vocab_loader.vocab_index_dict)
    data_loader.load(config.get("data_fp"), 'utf-8')

    LogInfo.logs("Create train, valid, test split...")
    # train_split / valid_split are fractions of the data; test gets the rest.
    train_size = int(config.get("train_split") * data_loader.data_size)
    valid_size = int(config.get("valid_split") * data_loader.data_size)
    test_size = data_loader.data_size - train_size - valid_size

    train_data = data_loader.data[:train_size]

    # Transpose the validation rows into per-field tuples; each row has six
    # fields, of which only the first three are kept.
    query_idx_v, query_len_v, label_v, _, _, _ = \
        zip(*data_loader.data[train_size:train_size+valid_size])

    query_idx_t, query_len_t, label_t, _, _, _ = \
Esempio n. 29
0
 def get(self, key):
     """
     Return the value stored under `key` in self.config_dict.

     A missing key is not an error: a warning naming the key is logged and
     None is returned.
     """
     if key not in self.config_dict:
         # The key must be passed so the %s placeholder is actually filled.
         LogInfo.logs("[warning] key [%s] not exists.", key)
     return self.config_dict.get(key, None)
Esempio n. 30
0
from xusheng.util.data_util import VocabularyLoader
from xusheng.util.eval_util import eval_acc_pn
from xusheng.util.log_util import LogInfo

# Script entry point: load config, vocabulary and data, then split the data
# into train / valid / test parts with fixed fractions.
# Usage: <script> <setting_dir> <try_dir>
# NOTE(review): sys, np, ConfigDict and DataLoader are not imported in the
# visible lines, and the script may continue past this excerpt - confirm
# against the full file.
if __name__ == '__main__':
    setting_dir = sys.argv[1]
    try_dir = sys.argv[2]
    # Every run keeps its parameters in runnings/<setting_dir>/<try_dir>/param_config
    root_path = 'runnings/%s/%s' % (setting_dir, try_dir)
    config_path = '%s/param_config' % root_path
    config = ConfigDict(config_path)

    vocab_loader = VocabularyLoader()
    vocab_loader.load_vocab(config.get("vocab_fp"),
                            config.get("embedding_dim"), 'utf-8')
    # vocab_size is only known after the vocabulary is loaded.
    config.add("vocab_size", vocab_loader.vocab_size)
    LogInfo.logs("Embedding shape: %s.", vocab_loader.vocab_embedding.shape)

    data_loader = DataLoader(config.get("max_seq_len"),
                             vocab_loader.vocab_index_dict)
    data_loader.load(config.get("data_fp"), 'utf-8')

    LogInfo.logs("Create train, valid, test split...")
    # 99.7% train, 0.1% valid, remainder test.
    train_size = int(0.997 * data_loader.data_size)
    valid_size = int(0.001 * data_loader.data_size)
    test_size = data_loader.data_size - train_size - valid_size

    train_data = data_loader.data[:train_size]

    valid_data = data_loader.data[train_size:train_size + valid_size]
    # Transpose the validation rows (three fields each) into per-field tuples.
    context_idx_valid, context_seq_valid, pinlei_idx_valid = zip(*valid_data)
    context_idx_valid = np.array(context_idx_valid)