コード例 #1
0
ファイル: identifier.py プロジェクト: lkq1992yeah/CompQA
 def save(self, session, dir_path):
     """
     Save the current model under ``dir_path`` as ``best_model``.

     :param session: session object handed to ``self.saver.save``
     :param dir_path: target directory; it is created, together with any
                      missing parent directories, when it does not exist
     """
     import os
     # makedirs instead of mkdir: a nested path whose parents are missing
     # used to fail with OSError.
     if not os.path.isdir(dir_path):
         os.makedirs(dir_path)
     fp = dir_path + "/best_model"
     self.saver.save(session, fp)
     LogInfo.logs("Model saved into %s.", fp)
コード例 #2
0
ファイル: data_util.py プロジェクト: lkq1992yeah/CompQA
def load_kkv_table(file_path):
    """
    Load a (key1, key2) -> value table from a whitespace-separated text file.

    Every usable line has at least three fields. The first two are joined
    with a single space to form the dictionary key and the third becomes the
    value; further fields are ignored. A later line with the same key
    overwrites an earlier one.

    :param file_path: path of the UTF-8 encoded input file
    :return: dict mapping "key1 key2" to the value string
    """
    kkv_table = dict()
    with codecs.open(file_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            spt = line.strip().split()
            if len(spt) < 3:
                LogInfo.logs("[error] bad line: %s", line.strip())
                # Skip the malformed line; falling through would raise
                # IndexError on spt[2] right after reporting it.
                continue
            kkv_table[spt[0] + ' ' + spt[1]] = spt[2]
    return kkv_table
コード例 #3
0
ファイル: data_util.py プロジェクト: lkq1992yeah/CompQA
    def create_batches(self):
        """
        Set ``self.num_batches`` to ceil(data_size / batch_size).

        :raises AssertionError: when the computed number of batches is zero
        """
        # Pure integer ceiling division; int(a / b) goes through a float
        # under true division and can lose precision for very large sizes.
        full_batches, remainder = divmod(self.data_size, self.batch_size)
        self.num_batches = full_batches + 1 if remainder else full_batches

        # When the data (tensor) is too small, give a better error message.
        # Raised explicitly so the check is not stripped by ``python -O``.
        if self.num_batches == 0:
            raise AssertionError(
                "Not enough data. Make seq_length and batch_size small.")
        LogInfo.logs("Batches created. (%d)", self.num_batches)
コード例 #4
0
ファイル: misc.py プロジェクト: lkq1992yeah/CompQA
 def load_pinlei(self):
     """
     Fill ``self.pinlei`` with the ``[[``-prefixed names of the vector file.

     Only the first whitespace-separated token of each line is inspected;
     it is added to ``self.pinlei`` when it starts with ``[[``.
     """
     LogInfo.begin_track("Load pinlei names...")
     with codecs.open("/u01/xusheng/word2vec/vec/yyh_pinlei.txt",
                      'r',
                      encoding='utf-8') as fin:
         for line in fin:
             fields = line.split()
             # A blank line has no first token; skip it instead of
             # failing with IndexError.
             if not fields:
                 continue
             name = fields[0]
             if name.startswith("[["):
                 self.pinlei.add(name)
     LogInfo.end_track("Pinlei name loaded. Size: %d.", len(self.pinlei))
コード例 #5
0
ファイル: misc.py プロジェクト: lkq1992yeah/CompQA
 def add_pinlei_tag_yyh(self):
     """
     Rewrite the w2v training text with known pinlei names wrapped in ``[[..]]``.

     Reads ``<root_fp>/yyh_w2v_train.txt`` and writes
     ``<root_fp>/yyh_w2v_train.txt.pinlei_tag``. Each line is split on
     whitespace and scanned left to right; at every position the longest run
     of 4, 3, 2 or 1 consecutive tokens whose concatenation is in
     ``self.pinlei_set`` is replaced by a single ``[[name]]`` token. Every
     output token is followed by one space.
     """
     LogInfo.begin_track("Begin adding tags for pinleis...")
     cnt = 0
     # ``with`` guarantees both files are closed even if a line fails.
     with codecs.open(self.root_fp + "/yyh_w2v_train.txt",
                      'r',
                      encoding='utf-8') as fin, \
             codecs.open(self.root_fp + "/yyh_w2v_train.txt.pinlei_tag",
                         'w',
                         encoding='utf-8') as fout:
         for line in fin:
             spt = line.strip().split()
             pieces = []
             i = 0
             while i < len(spt):
                 # Greedy longest match first.
                 for width in (4, 3, 2, 1):
                     if i + width > len(spt):
                         continue
                     # The 4-term case previously concatenated spt[3]
                     # instead of spt[i + 3]; slicing uses the right window.
                     name = "".join(spt[i:i + width])
                     if name not in self.pinlei_set:
                         continue
                     if width == 4:
                         LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                      *spt[i:i + 4])
                     pieces.append("[[" + name + "]]")
                     i += width
                     break
                 else:
                     # No pinlei starts here: copy the token unchanged.
                     pieces.append(spt[i])
                     i += 1
             new_line = "".join(piece + " " for piece in pieces)
             fout.write(new_line + "\n")
             cnt += 1
             if cnt < 5:
                 LogInfo.logs("res ==> (%s)", new_line)
             LogInfo.show_line(cnt, 100000)
     LogInfo.end_track("Pinlei tags added.")
コード例 #6
0
ファイル: misc.py プロジェクト: lkq1992yeah/CompQA
 def load_pinlei(self):
     """
     Fill ``self.pinlei_set`` from ``<root_fp>/raw/kg_pinlei_id``.

     The first tab-separated column of every line is added; lines with
     fewer than two columns are ignored.
     """
     LogInfo.begin_track("Load pinlei names...")
     source_fp = self.root_fp + "/raw/kg_pinlei_id"
     with codecs.open(source_fp, 'r', encoding='utf-8') as reader:
         for raw in reader:
             columns = raw.strip().split("\t")
             if len(columns) >= 2:
                 self.pinlei_set.add(columns[0])
     LogInfo.end_track("%d names loaded.", len(self.pinlei_set))
コード例 #7
0
ファイル: misc.py プロジェクト: lkq1992yeah/CompQA
    def tag_pinlei(self, query):
        """
        Tag the pinlei names occurring in a whitespace-tokenised query.

        The tokens are scanned left to right. At each position the longest
        run of 5, 4, 3, 2 or 1 consecutive tokens whose concatenation,
        wrapped as ``[[name]]``, is in ``self.pinlei`` is replaced by that
        tag. Tokens that start no match are copied through and also
        collected as context.

        :param query: raw query string
        :return: (tagged query, context tokens joined by spaces,
                  list of the distinct tags found)
        """
        LogInfo.logs("Tagging pinlei for your query...")
        # Only matches of three or more terms are reported.
        found_fmt = {
            5: "Found 5-term pinlei [%s|%s|%s|%s|%s]",
            4: "Found 4-term pinlei [%s|%s|%s|%s]",
            3: "Found 3-term pinlei [%s|%s|%s]",
        }
        tokens = query.strip().split()
        total = len(tokens)
        tagged_parts = []
        context_parts = []
        found = set()
        pos = 0
        while pos < total:
            for width in (5, 4, 3, 2, 1):
                if pos + width > total:
                    continue
                window = tokens[pos:pos + width]
                tag = "[[" + "".join(window) + "]]"
                if tag not in self.pinlei:
                    continue
                if width in found_fmt:
                    LogInfo.logs(found_fmt[width], *window)
                found.add(tag)
                tagged_parts.append(tag)
                pos += width
                break
            else:
                # Plain token: part of both the output line and the context.
                context_parts.append(tokens[pos])
                tagged_parts.append(tokens[pos])
                pos += 1

        return " ".join(tagged_parts), " ".join(context_parts), list(found)
コード例 #8
0
ファイル: copa_eva.py プロジェクト: lkq1992yeah/CompQA
def cal_score(cause, effect, cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio, verbose=False):
    """
    Score a (cause, effect) pair of word sequences by summing word-pair scores.

    Only cause words present in both ``cdic`` and ``cnegdic`` and effect
    words present in both ``edic`` and ``enegdic`` take part. For every pair
    of distinct words (c, e) the pair score is

        lamd * get_similar(cdic[c], enegdic[e], norm)
        + (1 - lamd) * get_similar(cnegdic[c], enegdic[e], norm)

    A pair is dropped when the forward score and the reverse score
    ``get_similar(cdic[e], enegdic[c], norm)`` differ by less than ``ratio``
    relative to the smaller of their magnitudes.

    :param cause: iterable of cause-side words
    :param effect: iterable of effect-side words
    :param lamd: interpolation weight between the two similarity terms
    :param setting: 1: raw sum; 2: sum / (#cause words + #effect words);
                    3: sum / (#cause words * #effect words);
                    anything else: sum / number of kept pairs
    :param norm: forwarded to ``get_similar``
    :param ratio: threshold of the reverse-direction filter
    :param verbose: log every kept pair (highest score first) and the
                    final normalisation
    :return: the aggregated score; 0.0 whenever the normaliser is zero
    """
    score = 0
    num = 0
    rcause = [word for word in cause if word in cdic and word in cnegdic]
    reffect = [word for word in effect if word in edic and word in enegdic]

    # (pair score, description) for every kept pair. A list keeps pairs
    # with equal scores, which a score-keyed dict silently overwrote.
    pair_logs = []
    for wordc in rcause:
        for worde in reffect:
            if wordc == worde:
                continue
            score_suf = get_similar(cdic[wordc], enegdic[worde], norm)
            score_nec = get_similar(cnegdic[wordc], enegdic[worde], norm)
            tmp = lamd * score_suf + (1-lamd) * score_nec
            # check reverse
            # NOTE(review): assumes worde is in cdic and wordc is in enegdic,
            # which the filters above do not guarantee - confirm that the
            # dictionaries share one vocabulary.
            score_reverse = get_similar(cdic[worde], enegdic[wordc], norm)
            # Multiplied form of |suf - rev| / min(|suf|, |rev|) < ratio;
            # avoids a ZeroDivisionError when either score is exactly 0.
            smaller = min(abs(score_suf), abs(score_reverse))
            if abs(score_suf - score_reverse) < ratio * smaller:
                continue
            score += tmp
            num += 1
            if verbose:
                pair_logs.append(
                    (tmp,
                     "[%s]-[%s] ==> %.1f*[%.2f]+%.1f*[%.2f]=[%.4f]" %
                     (wordc, worde, lamd, score_suf, 1-lamd, score_nec, tmp)))

    if verbose:
        for _, line in sorted(pair_logs, key=lambda item: item[0], reverse=True):
            LogInfo.logs(line)

    if setting == 1:
        return score
    if setting == 2:
        denom = len(rcause) + len(reffect)
        # No usable word on either side: define the score as 0.0 rather
        # than dividing by zero.
        result = score / denom if denom else 0.0
        if verbose:
            LogInfo.logs("%.4f / (%d+%d=%d) = %.4f", score,
                         len(rcause), len(reffect), denom, result)
        return result
    if setting == 3:
        denom = len(rcause) * len(reffect)
        return score / denom if denom else 0.0
    if num == 0:
        if verbose:
            LogInfo.logs("%.4f / %d = %.4f", score, 0, 0.0)
        return 0.0
    if verbose:
        LogInfo.logs("%.4f / %d = %.4f", score, num, score/num)
    return score/num
コード例 #9
0
    def eval_avg(self, setting=1):
        """
        Evaluate items 500..999 with one representation per sentence.

        For every item the question and both options are turned into
        vectors by ``self.get_repr``; option 1 is predicted when its
        similarity to the question is strictly greater, otherwise option 2.
        The accuracy over the 500 items is logged; nothing is returned.

        :param setting: forwarded to ``self.get_repr``
        """
        LogInfo.begin_track(
            "Eval on Copa using average word representations using setting %d...",
            setting)
        first, last = 500, 1000
        total = last - first
        hits = 0
        for idx in range(first, last):
            ground = self.copa_ground[idx]
            ask4 = ground[0]
            premise, alt1, alt2 = self.copa_data[idx]
            premise_vec = self.get_repr(premise, ask4, setting, 'q')
            alt_vecs = [self.get_repr(alt, ask4, setting, 'o')
                        for alt in (alt1, alt2)]
            sim1, sim2 = [self.get_similarity(premise_vec, vec)
                          for vec in alt_vecs]
            # Ties go to option 2, as only a strict win selects option 1.
            predicted = 1 if sim1 > sim2 else 2
            if ground[1] == predicted:
                hits += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(hits) / total, hits, total)
        LogInfo.end_track()
コード例 #10
0
ファイル: data.py プロジェクト: lkq1992yeah/CompQA
def fuzzy_match_name(mention, vocab, PN):
    """
    Rank the names of ``vocab`` by Jaccard score against ``mention``.

    :param mention: iterable whose items form the mention's element set
    :param vocab: mapping from name to its element set (iterated with
                  ``.items()``)
    :param PN: the ranked list is created with capacity PN-1
    :return: ``top_names()`` of the ranked list
    """
    mention_items = set(mention)
    ranking = TopKRankedList(PN - 1)
    for cand_name, cand_items in vocab.items():
        similarity = get_jaccard_score(mention_items, cand_items)
        # Candidates scoring exactly 1.0 are left out of the ranking.
        if similarity != 1.0:
            ranking.push((cand_name, similarity))
    LogInfo.logs("Cands for %s: [%s]", mention,
                 "|".join(ranking.top_names()))
    return ranking.top_names()
コード例 #11
0
    def eval_pair(self, setting=1, strategy=1):
        """
        Evaluate items 0..1870 by summing word-pair similarities.

        For each item, every (sentence word, option word) pair whose words
        are both found in the respective vector maps adds its similarity to
        that option's score. Option 1 is predicted when its score is
        strictly greater, otherwise option 2. The accuracy is logged;
        nothing is returned.

        :param setting: forwarded to ``self.get_vec_map``
        :param strategy: 1: raw sum, 2: sum / (T1+T2), 3: sum / (T1*T2),
                         with T1, T2 the lengths of sentence and option
        """
        LogInfo.begin_track(
            "Eval on ROC using word pairs using setting %d and strategy %d...",
            setting, strategy)
        total = 1871
        hits = 0
        for idx in range(0, total):
            premise, alt1, alt2 = self.copa_data[idx]
            ground = self.copa_ground[idx]
            ask4 = ground[0]
            q_vecs = self.get_vec_map(ask4=ask4, setting=setting, role='q')
            o_vecs = self.get_vec_map(ask4=ask4, setting=setting, role='o')

            # Same accumulation for both options, option 1 first.
            sums = []
            for alt in (alt1, alt2):
                acc = 0.0
                for q_word in premise:
                    for o_word in alt:
                        if q_word in q_vecs and o_word in o_vecs:
                            acc += self.get_similarity(q_vecs[q_word],
                                                       o_vecs[o_word])
                sums.append(acc)
            sim1, sim2 = sums

            if strategy == 2:
                sim1 /= (len(premise) + len(alt1))
                sim2 /= (len(premise) + len(alt2))
            elif strategy == 3:
                sim1 /= (len(premise) * len(alt1))
                sim2 /= (len(premise) * len(alt2))

            predicted = 1 if sim1 > sim2 else 2
            if ground[1] == predicted:
                hits += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(hits) / total, hits, total)
        LogInfo.end_track()
コード例 #12
0
ファイル: config.py プロジェクト: lkq1992yeah/CompQA
def load_configs(fp):
    """
    Parse a tab-separated ``key<TAB>value<TAB>type`` configuration file.

    Empty lines, lines starting with ``#`` and lines without a tab are
    skipped silently; lines with fewer than three fields are reported and
    skipped. The type field selects the conversion:

    * ``d`` / ``int``: int
    * ``f`` / ``float`` / ``double``: float
    * ``b`` / ``bool``: True for ``true``, ``True``, ``TRUE`` or ``1``,
      otherwise False
    * ``tf`` / ``tensorflow``: ``tf.nn.relu`` / ``sigmoid`` / ``tanh``;
      any other value leaves the key untouched
    * type ``None`` or value ``None``: None
    * anything else: the raw value string

    :param fp: path of the configuration file
    :return: dict of parsed settings
    """
    LogInfo.begin_track('Loading config from %s: ', fp)
    config_dict = {}
    with open(fp, 'r') as br:
        for raw in br.readlines():
            line = raw.strip()
            if line == '' or line.startswith('#') or '\t' not in line:
                continue
            spt = line.split('\t')
            if len(spt) < 3:
                LogInfo.logs("[%s] is invalid, pls add type!", line)
                continue
            k, v_str, t = spt[0], spt[1], spt[2]
            if t in ("d", "int"):
                config_dict[k] = int(v_str)
            elif t in ("f", "float", "double"):
                config_dict[k] = float(v_str)
            elif t in ("b", "bool"):
                config_dict[k] = v_str in ("true", "True", "TRUE", "1")
            elif t in ("tf", "tensorflow"):
                # Unknown activation names are not stored at all.
                if v_str in ('relu', 'sigmoid', 'tanh'):
                    config_dict[k] = getattr(tf.nn, v_str)
            elif "None" in (t, v_str):
                config_dict[k] = None
            else:
                config_dict[k] = v_str
            LogInfo.logs('%s = %s', k, v_str)

    LogInfo.end_track()
    return config_dict
コード例 #13
0
ファイル: data_util.py プロジェクト: lkq1992yeah/CompQA
    def load_vocab_name(self, vocab_file, encoding):
        """
        Rebuild the word <-> index tables from a one-word-per-line file.

        Each stripped line becomes a vocabulary entry whose index is its
        zero-based line number; ``self.vocab_size`` ends up as the number
        of lines read.

        :param vocab_file: path of the vocabulary file
        :param encoding: text encoding used to read it
        """
        LogInfo.begin_track("Loading vocab from %s...", vocab_file)
        self.vocab_size = 0
        self.index_vocab_dict.clear()
        self.vocab_index_dict.clear()
        loaded = 0
        with codecs.open(vocab_file, 'r', encoding=encoding) as reader:
            for loaded, raw in enumerate(reader, 1):
                word = raw.strip()
                self.vocab_index_dict[word] = loaded - 1
                self.index_vocab_dict.append(word)
                LogInfo.show_line(loaded, 50000)

        self.vocab_size = loaded
        LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
コード例 #14
0
ファイル: data.py プロジェクト: lkq1992yeah/CompQA
 def load(self, data_file, encoding):
     """
     Decode every line of ``data_file`` into ``self.data``.

     Each line is passed to ``self.decode_line``, which must yield three
     values; they are stored as one tuple per line. ``self.data_size`` is
     set to the number of tuples.

     :param data_file: path of the data file
     :param encoding: text encoding used to read it
     """
     LogInfo.begin_track("Loading data from %s...", data_file)
     records = []
     with codecs.open(data_file, 'r', encoding=encoding) as reader:
         for line_no, raw in enumerate(reader, 1):
             context_idx, context_seq, pinlei_idx = self.decode_line(raw)
             records.append((context_idx, context_seq, pinlei_idx))
             LogInfo.show_line(line_no, 10000)
     self.data = records
     self.data_size = len(records)
     LogInfo.end_track()
コード例 #15
0
ファイル: data_util.py プロジェクト: lkq1992yeah/CompQA
    def load_vocab_embedding(self, embedding_file, encoding):
        """
        Load one embedding row per vocabulary word into ``self.vocab_embedding``.

        Every line holds a word followed by its float components. The row
        is stored at the index ``self.vocab_index_dict`` assigns to the
        word. The number of lines must equal the vocabulary size.

        :param embedding_file: path of the embedding file
        :param encoding: text encoding used to read it
        """
        LogInfo.begin_track("Loading embeddings from %s...", embedding_file)
        table = [None] * len(self.vocab_index_dict)
        seen = 0
        with codecs.open(embedding_file, 'r', encoding=encoding) as reader:
            for seen, raw in enumerate(reader, 1):
                fields = raw.split()
                vector = [float(tok) for tok in fields[1:]]
                table[self.vocab_index_dict[fields[0]]] = vector
                LogInfo.show_line(seen, 50000)

        assert seen == len(table)
        self.vocab_embedding = np.asarray(table)
        LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
コード例 #16
0
ファイル: copa_eva.py プロジェクト: lkq1992yeah/CompQA
def main():
    """
    Command-line driver for the word-pair evaluation.

    ``sys.argv[1]`` selects the mode:

    * ``full``: grid over ratio 0.0..2.0 and lambda 0.0..1.0 (step 0.1) with
      setting 4 and ``norm=True``; prints ``ratio lambda accuracy`` per run.
    * ``case``: trace one configuration. ``sys.argv[2]`` is lambda (float),
      ``sys.argv[3]`` the setting (int) and the optional ``sys.argv[4]`` the
      ratio (float, default 0.0).
    """
    copa, worddic = readcopa()
    label = readlabel()
    cdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt")
    enegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt")
    cnegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt")
    edic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt")

    verbose = False

    import sys
    mode = sys.argv[1]
    if mode == 'full':
        for ratio in range(21):
            for lamd in range(11):
                acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, 4, True, ratio*0.1, verbose)
                # Single pre-formatted string: prints the same text under
                # Python 2 and parses under Python 3.
                print("%s %s %s" % (ratio*0.1, lamd*0.1, acc))
    elif mode == 'case':
        para1 = float(sys.argv[2])
        para2 = int(sys.argv[3])
        # word_word1 takes ``ratio`` before ``verbose``. The old call passed
        # verbose positionally in the ratio slot, so tracing never turned on
        # and ratio became True.
        # NOTE(review): default ratio 0.0 is an assumption - confirm the
        # intended value for case tracing.
        ratio = float(sys.argv[4]) if len(sys.argv) > 4 else 0.0
        LogInfo.begin_track("case tracing for word-pair & lambda=%.1f, setting=%d:", para1, para2)
        verbose = True
        acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic, para1, 500, para2, True, ratio, verbose)
        LogInfo.logs("[Accuracy] %.4f", acc)
        LogInfo.end_track()
コード例 #17
0
    def eval_avg_lambda(self, lamb=1.0):
        """
        Evaluate items 500..999 by interpolating two representation settings.

        Both options are scored once with setting 1 and once with setting 2
        of ``self.get_repr``; the final score of an option is
        ``lamb * score(setting 1) + (1 - lamb) * score(setting 2)``.
        Option 1 is predicted when its final score is strictly greater,
        otherwise option 2. The accuracy is logged; nothing is returned.

        :param lamb: weight of the setting-1 score
        """
        LogInfo.begin_track(
            "Eval on Copa using average word representations using lambda %.2f...",
            lamb)

        def option_scores(premise, alt1, alt2, ask4, setting):
            # Similarity of the question to each option under one setting.
            premise_vec = self.get_repr(premise, ask4, setting, 'q')
            alt1_vec = self.get_repr(alt1, ask4, setting, 'o')
            alt2_vec = self.get_repr(alt2, ask4, setting, 'o')
            return (self.get_similarity(premise_vec, alt1_vec),
                    self.get_similarity(premise_vec, alt2_vec))

        total = 500
        hits = 0
        for idx in range(500, 1000):
            ground = self.copa_ground[idx]
            ask4 = ground[0]
            premise, alt1, alt2 = self.copa_data[idx]
            score1a, score2a = option_scores(premise, alt1, alt2, ask4, 1)
            score1b, score2b = option_scores(premise, alt1, alt2, ask4, 2)

            score1 = (score1a * lamb) + (score1b * (1 - lamb))
            score2 = (score2a * lamb) + (score2b * (1 - lamb))

            predicted = 1 if score1 > score2 else 2
            if ground[1] == predicted:
                hits += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(hits) / total, hits, total)
        LogInfo.end_track()
コード例 #18
0
ファイル: misc.py プロジェクト: lkq1992yeah/CompQA
    def prepare_model_data(self, pinlei_num):
        """
        Generate evaluation data for queries labelled with ``pinlei_num`` pinleis.

        Reads ``<root_fp>/query_label.txt.<pinlei_num>`` (tab-separated:
        column 1 is the context, columns 2 .. 2+pinlei_num-1 are pinlei
        names). A line is kept only when every ``[[name]]`` is in
        ``self.pinlei`` and the context has between 6 and 15 space-separated
        tokens. For each kept line and each of its pinleis, one
        ``context<TAB>[[name]]`` row goes to
        ``model_data_test.<pinlei_num>.name`` and column 0 goes to
        ``model_data_test.<pinlei_num>.check``.

        :param pinlei_num: number of pinlei columns per line; also stored
                           in ``self.pinlei_num``
        """
        self.pinlei_num = pinlei_num
        LogInfo.begin_track("Generate Multi-Pinlei Data for evaluation...")
        cnt = 0
        not_cover = set()
        # ``with`` closes all three files even when a line raises.
        with codecs.open(self.root_fp + "/query_label.txt." +
                         str(self.pinlei_num),
                         'r',
                         encoding='utf-8') as fin, \
                codecs.open(self.root_fp + "/model_data_test." +
                            str(self.pinlei_num) + ".name",
                            'w',
                            encoding='utf-8') as fout, \
                codecs.open(self.root_fp + "/model_data_test." +
                            str(self.pinlei_num) + ".check",
                            'w',
                            encoding='utf-8') as fsho:
            for line in fin:
                cnt += 1
                if cnt % 100000 == 0:
                    LogInfo.logs("%d lines processed.", cnt)
                    fout.flush()
                spt = line.strip().split("\t")
                context = spt[1]
                # Explicit indexing (not slicing) so a line with too few
                # columns still raises instead of being silently shortened.
                pinleis = ["[[" + spt[i] + "]]"
                           for i in range(2, 2 + self.pinlei_num)]
                missing = [p for p in pinleis if p not in self.pinlei]
                if missing:
                    # Record every uncovered name of the line, then drop it.
                    not_cover.update(missing)
                    continue
                token_count = len(context.split(" "))
                if token_count < 6 or token_count > 15:
                    continue
                for pinlei in pinleis:
                    fout.write(context + "\t" + pinlei + "\n")
                    fsho.write(spt[0] + "\n")

        LogInfo.end_track("%d pinlei not cover.", len(not_cover))
コード例 #19
0
ファイル: misc.py プロジェクト: lkq1992yeah/CompQA
    def prepare_model_data(self):
        """
        Generate training data with one positive and 19 sampled negatives per query.

        Reads ``<root_fp>/query_label.txt.1`` (tab-separated: column 1 is
        the context, column 2 the pinlei name) and writes
        ``<root_fp>/model_data_train.name``. A line is skipped when its
        ``[[name]]`` is not in ``self.pinlei`` or when the context does not
        have between 6 and 15 space-separated tokens. For each kept line the
        positive ``context<TAB>[[name]]`` row is written first, followed by
        one row per negative returned by ``self.neg_sample_random``.
        """
        LogInfo.begin_track("Generate model data...")
        not_cover = 0
        not_context = 0
        cnt = 0
        # .1 means single pinlei
        # ``with`` closes both files even when a line raises.
        with codecs.open(self.root_fp + "/query_label.txt.1",
                         'r',
                         encoding='utf-8') as fin, \
                codecs.open(self.root_fp + "/model_data_train.name",
                            'w',
                            encoding='utf-8') as fout:
            for line in fin:
                cnt += 1
                if cnt % 100000 == 0:
                    LogInfo.logs("%d lines processed.", cnt)
                    fout.flush()
                spt = line.strip().split("\t")
                context = spt[1]
                pinlei = "[[" + spt[2] + "]]"
                if pinlei not in self.pinlei:
                    not_cover += 1
                    continue
                token_count = len(context.split(" "))
                if token_count < 6 or token_count > 15:
                    not_context += 1
                    continue
                fout.write(context + "\t" + pinlei + "\n")
                negs = self.neg_sample_random(pinlei, 19)
                for neg in negs:
                    fout.write(context + "\t" + neg + "\n")

        LogInfo.end_track("Model data prepared. Size: %d. (%d, %d).",
                          cnt - not_context - not_cover, not_cover,
                          not_context)
コード例 #20
0
ファイル: data_util.py プロジェクト: lkq1992yeah/CompQA
 def load_vocab(self, vocab_file, embedding_dim, encoding):
     """
     Load words and their embeddings from one ``word v1 v2 ...`` file.

     Index 0 is reserved for ``[[NULL]]`` with an all-zero embedding of
     length ``embedding_dim`` (used for query terms that are not found);
     the file's words follow from index 1. ``self.vocab_embedding`` ends up
     as a numpy array and ``self.vocab_size`` as its number of rows.

     :param vocab_file: path of the embedding file
     :param embedding_dim: length of the zero vector for ``[[NULL]]``
     :param encoding: text encoding used to read the file
     """
     LogInfo.begin_track("Loading vocab from %s...", vocab_file)
     self.vocab_size = 0
     self.index_vocab_dict.clear()
     self.vocab_index_dict.clear()
     # Rows are collected in a fresh local list. self.vocab_embedding is
     # replaced by an ndarray below, so calling .clear() on it made a
     # second load_vocab() call fail.
     rows = []
     with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
         index = 0
         # 0 embedding for not-found query term
         self.vocab_index_dict["[[NULL]]"] = index
         self.index_vocab_dict.append("[[NULL]]")
         rows.append([0.0] * embedding_dim)
         index += 1
         for line in fin:
             spt = line.split()
             # Blank lines carry no word; skip them instead of raising
             # IndexError on spt[0].
             if not spt:
                 continue
             self.vocab_index_dict[spt[0]] = index
             self.index_vocab_dict.append(spt[0])
             rows.append([float(tok) for tok in spt[1:]])
             index += 1
             LogInfo.show_line(index, 50000)
     self.vocab_size = len(rows)
     self.vocab_embedding = np.array(rows)
     LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
コード例 #21
0
ファイル: identifier.py プロジェクト: lkq1992yeah/CompQA
 def load(self, session, fp):
     """
     Restore the model stored at ``fp`` into ``session`` via ``self.saver``.

     :param session: session object handed to ``self.saver.restore``
     :param fp: path of the saved model
     """
     log = LogInfo.logs
     log("Loading Model from %s", fp)
     self.saver.restore(session, fp)
     log("Model loaded from %s", fp)
コード例 #22
0
    def eval_pair(self, setting=1, strategy=1):
        """
        Evaluate items 500..999 by summing word-pair similarities.

        For each item, every (sentence word, option word) pair whose words
        are both found in the respective vector maps adds its similarity to
        that option's score. Option 1 is predicted when its score is
        strictly greater, otherwise option 2. The overall accuracy is
        logged; with ``setting == 3`` the accuracy is also logged separately
        for 'cause' items and for all other items. Nothing is returned.

        :param setting: forwarded to ``self.get_vec_map``
        :param strategy: 1: raw sum, 2: sum / (T1+T2), 3: sum / (T1*T2),
                         with T1, T2 the lengths of sentence and option
        """
        LogInfo.begin_track(
            "Eval on Copa using word pairs using setting %d and strategy %d...",
            setting, strategy)
        correct = 0
        cause = 0
        effect = 0
        cause_correct = 0
        effect_correct = 0
        for i in range(500, 1000):
            sentence, option1, option2 = self.copa_data[i]
            ask4 = self.copa_ground[i][0]
            is_cause = ask4 == 'cause'
            if is_cause:
                cause += 1
            else:
                effect += 1
            q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o')

            # Same accumulation for both options, option 1 first. The
            # per-pair debug strings that were built but never used are gone.
            scores = []
            for option in (option1, option2):
                total = 0.0
                for word1 in sentence:
                    for word2 in option:
                        if word1 in q_vec_map and word2 in o_vec_map:
                            total += self.get_similarity(q_vec_map[word1],
                                                         o_vec_map[word2])
                scores.append(total)
            score1, score2 = scores

            if strategy == 2:
                score1 /= (len(sentence) + len(option1))
                score2 /= (len(sentence) + len(option2))
            elif strategy == 3:
                score1 /= (len(sentence) * len(option1))
                score2 /= (len(sentence) * len(option2))

            truth = self.copa_ground[i][1]
            predicted = 1 if score1 > score2 else 2
            if truth == predicted:
                correct += 1
                # Tracked for every setting; only reported for setting 3.
                if is_cause:
                    cause_correct += 1
                else:
                    effect_correct += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(correct) / 500, correct, 500)
        if setting == 3:
            # Report 0.0 for a question type that never occurred instead of
            # dividing by zero.
            cause_acc = float(cause_correct) / cause if cause else 0.0
            effect_acc = float(effect_correct) / effect if effect else 0.0
            LogInfo.logs(
                "[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                cause_acc, cause_correct, cause,
                effect_acc, effect_correct, effect)
        LogInfo.end_track()
コード例 #23
0
ファイル: copa_eva.py プロジェクト: lkq1992yeah/CompQA
def word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd, num, setting, norm, ratio, verbose=False):
    """
    Evaluate ``cal_score`` on the COPA items with index ``num`` .. 999.

    For every item both alternatives are scored and the higher score is the
    prediction; a tie matches neither answer and is counted as wrong.

    :param copa: indexable of (premise, alternative1, alternative2) triples
    :param label: indexable of (ask, answer) pairs. ask == 0 means the
                  alternatives are scored as causes of the premise, ask == 1
                  means they are scored as its effects; answer is 1 or 2.
    :param cdic: passed through to ``cal_score`` unchanged
    :param enegdic: passed through to ``cal_score`` unchanged
    :param cnegdic: passed through to ``cal_score`` unchanged
    :param edic: passed through to ``cal_score`` unchanged
    :param lamd: passed through to ``cal_score`` unchanged
    :param num: index of the first item to evaluate
    :param setting: passed through to ``cal_score`` unchanged
    :param norm: passed through to ``cal_score`` unchanged
    :param ratio: passed through to ``cal_score`` unchanged
    :param verbose: when True, trace every item through LogInfo
    :return: number of correct items divided by (1000 - num); raises
             ZeroDivisionError when num == 1000
    """
    def _score_direction(track_fmt, cause, effect):
        # Score a single cause -> effect direction, tracing it when verbose.
        if verbose:
            LogInfo.begin_track(track_fmt, cause, effect)
        value = cal_score(cause, effect, cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio, verbose)
        if verbose:
            LogInfo.logs("final score: %.4f", value)
            LogInfo.end_track()
        return value

    acc = 0
    wrong = 0
    for i in range(num, 1000):
        hyp, alt1, alt2 = copa[i]
        ask, labl = label[i]
        if verbose:
            LogInfo.begin_track("step into copa #%d", i+1)
            LogInfo.logs("q: %s", hyp)
            LogInfo.logs("o1: %s", alt1)
            LogInfo.logs("o2: %s", alt2)
            LogInfo.logs("answer: o%d", labl)

        if ask == 0 or ask == 1:
            if ask == 0:
                # ask for cause: each alternative is the cause, the premise the effect
                if verbose:
                    LogInfo.begin_track("[ask for cause] o1/o2 -> q")
                score1 = _score_direction("o1->q: [%s]->[%s]", alt1, hyp)
                score2 = _score_direction("o2->q: [%s]->[%s]", alt2, hyp)
                ok_msg, bad_msg = "[[correct]]", "[[wrong]]"
            else:
                # ask for effect: the premise is the cause, each alternative the effect
                if verbose:
                    LogInfo.begin_track("[ask for effect] q -> o1/o2")
                score1 = _score_direction("q->o1: [%s]->[%s]", hyp, alt1)
                score2 = _score_direction("q->o2: [%s]->[%s]", hyp, alt2)
                ok_msg, bad_msg = ">>correct<<", ">>wrong<<"

            is_correct = (score1 > score2 and labl == 1) or \
                         (score1 < score2 and labl == 2)
            if is_correct:
                acc += 1
            else:
                # Counted for both question types so that the final
                # "Y-W" status line is accurate (previously the cause
                # branch never updated this counter).
                wrong += 1
            if verbose:
                LogInfo.logs(ok_msg if is_correct else bad_msg)
                LogInfo.end_track()
        else:
            # Unknown question type: the item is neither correct nor wrong.
            # Parenthesised form prints the same on Python 2 and parses on Python 3.
            print(ask)
            if verbose:
                LogInfo.logs("[error] ask=%d", ask)

        if verbose:
            LogInfo.end_track("end for #%d", i+1)
            LogInfo.logs("===========")

    if verbose:
        LogInfo.logs("status: %dY-%dW/%d", acc, wrong, 1000-num)
    return acc*1.0/(1000-num)
コード例 #24
0
    def eval_pair_lambda(self, lamb=1.0, strategy=1):
        """
        Score COPA items 500..999 with an interpolation of two word-pair scores.

        For each item the similarity of every (sentence word, option word)
        pair is summed once with the setting=1 vector maps and once with the
        setting=2 maps; the two sums are mixed as lamb * s1 + (1 - lamb) * s2.
        Option 1 is predicted only when its score is strictly higher.

        :param lamb: weight given to the setting=1 score
        :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
        :return: None, the accuracies are only written to the log
        """
        def pair_sum(option, q_map, o_map):
            # Sum similarities over all word pairs covered by both maps,
            # iterating sentence words in the outer loop.
            total = 0.0
            for q_word in sentence:
                for o_word in option:
                    if q_word in q_map and o_word in o_map:
                        total += self.get_similarity(q_map[q_word],
                                                     o_map[o_word])
            return total

        LogInfo.begin_track(
            "Eval on Copa using word pairs using lambda %.2f and strategy %d...",
            lamb, strategy)
        correct = 0
        asked = {'cause': 0, 'effect': 0}
        hits = {'cause': 0, 'effect': 0}
        for idx in range(500, 1000):
            sentence, option1, option2 = self.copa_data[idx]
            ask4 = self.copa_ground[idx][0]
            # anything that is not 'cause' is tallied as an effect question
            kind = 'cause' if ask4 == 'cause' else 'effect'
            asked[kind] += 1

            # first component (setting=1 maps)
            q_map = self.get_vec_map(ask4=ask4, setting=1, role='q')
            o_map = self.get_vec_map(ask4=ask4, setting=1, role='o')
            first1 = pair_sum(option1, q_map, o_map)
            first2 = pair_sum(option2, q_map, o_map)

            # second component (setting=2 maps)
            q_map = self.get_vec_map(ask4=ask4, setting=2, role='q')
            o_map = self.get_vec_map(ask4=ask4, setting=2, role='o')
            second1 = pair_sum(option1, q_map, o_map)
            second2 = pair_sum(option2, q_map, o_map)

            score1 = (first1 * lamb) + (second1 * (1 - lamb))
            score2 = (first2 * lamb) + (second2 * (1 - lamb))
            if strategy == 2:
                score1 /= (len(sentence) + len(option1))
                score2 /= (len(sentence) + len(option2))
            elif strategy == 3:
                score1 /= (len(sentence) * len(option1))
                score2 /= (len(sentence) * len(option2))

            truth = self.copa_ground[idx][1]
            predicted = 1 if score1 > score2 else 2
            if predicted == truth:
                correct += 1
                hits[kind] += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(correct) / 500, correct, 500)
        LogInfo.logs("[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                     float(hits['cause']) / asked['cause'],
                     hits['cause'], asked['cause'],
                     float(hits['effect']) / asked['effect'],
                     hits['effect'], asked['effect'])
        LogInfo.end_track()
コード例 #25
0
ファイル: config.py プロジェクト: lkq1992yeah/CompQA
 def get(self, key):
     """
     Look up a configuration value.

     :param key: configuration key
     :return: the stored value, or None when the key is absent (a warning
              naming the key is logged in that case)
     """
     if key not in self.config_dict:
         # The format string has one %s placeholder, so the key must be passed.
         LogInfo.logs("[warning] key [%s] not exists.", key)
     return self.config_dict.get(key, None)
コード例 #26
0
ファイル: config.py プロジェクト: lkq1992yeah/CompQA
 def add(self, key, value):
     """
     Store ``value`` under ``key``, warning in the log when an existing
     entry is overwritten.

     :param key: configuration key
     :param value: value to store
     :return: None
     """
     table = self.config_dict
     if key in table:
         previous = str(table.get(key))
         LogInfo.logs("[warning] key already exists [%s: %s], now change to [%s].",
                      key, previous, value)
     table[key] = value
コード例 #27
0
ファイル: copa_eval_pos.py プロジェクト: lkq1992yeah/CompQA
    def load_data(self):
        """
        Read the embedding tables, the COPA questions and the COPA labels.

        Fills self.sync / self.syne_neg and then self.sync_neg / self.syne
        from pairs of files read in lockstep, appends 1000
        [sentence, option1, option2] entries to self.copa_data and one
        [spt[1], int(spt[2])] entry per label line to self.copa_ground.
        :return: None
        """
        def read_vector_pair(cause_path, effect_path, cause_table, effect_table):
            # Each row is "word v1 v2 ..."; row k of both files is consumed together.
            with open(cause_path) as finc, open(effect_path) as fine:
                for cnt, (linec, linee) in enumerate(zip(finc, fine), 1):
                    LogInfo.show_line(cnt, 100000)
                    cause_row = linec.strip().split()
                    effect_row = linee.strip().split()
                    cause_word, effect_word = cause_row[0], effect_row[0]
                    cause_vec = map(float, cause_row[1:])
                    effect_vec = map(float, effect_row[1:])
                    cause_table[cause_word] = cause_vec
                    effect_table[effect_word] = effect_vec

        def content_words(raw_line):
            # Keep tokens starting with NN, JJ or VB and return their
            # second ':'-separated field.
            return [token.split(':')[1]
                    for token in raw_line.strip().split()
                    if token.startswith(('NN', 'JJ', 'VB'))]

        LogInfo.begin_track("Loading data...")
        # (syn0_w2v_200.txt was used for both files in an earlier experiment)
        read_vector_pair(
            "/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt",
            "/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt",
            self.sync, self.syne_neg)
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        read_vector_pair(
            "/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt",
            "/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt",
            self.sync_neg, self.syne)
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # Every question occupies three consecutive lines: sentence, option 1, option 2.
        with open("/home/yuchen/data/copa_lem.txt") as fin:
            for _ in range(1000):
                raw_sentence = fin.readline()
                raw_option1 = fin.readline()
                raw_option2 = fin.readline()
                self.copa_data.append([content_words(raw_sentence),
                                       content_words(raw_option1),
                                       content_words(raw_option2)])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                fields = line.strip().split('\t')
                self.copa_ground.append([fields[1], int(fields[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
コード例 #28
0
ファイル: data.py プロジェクト: lkq1992yeah/CompQA
 def load(self, data_file, encoding):
     """
     Load the dataset into self.data, building an index cache on first use.

     If ``data_file`` exists it is read as the cache: one record per line,
     six tab-separated fields (query ids, query length, labels, intent,
     link mask, entity ids), list fields space-separated integers.
     Otherwise ``data_file + ".name"`` is decoded line by line with
     self.decode_line, and every decoded record is both kept and written
     to ``data_file`` in that cache format.

     :param data_file: path of the cache file
     :param encoding: text encoding used for reading and writing
     :return: None; sets self.data (list of 6-tuples) and self.data_size
     """
     LogInfo.begin_track("Loading data from %s...", data_file)
     query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
         = list(), list(), list(), list(), list(), list()
     cnt = 0
     if os.path.isfile(data_file):
         LogInfo.begin_track("[Exist] Loading from %s...", data_file)
         with codecs.open(data_file, 'r', encoding=encoding) as fin:
             for line in fin:
                 spt = line.strip().split("\t")
                 query_idxs.append([int(idx) for idx in spt[0].split(" ")])
                 query_lens.append(int(spt[1]))
                 labels.append([int(idx) for idx in spt[2].split(" ")])
                 intents.append(int(spt[3]))
                 link_masks.append([int(idx) for idx in spt[4].split(" ")])
                 entity_idxs.append([int(idx) for idx in spt[5].split(" ")])
                 cnt += 1
                 LogInfo.show_line(cnt, 1000000)
     else:
         txt_data_file = data_file + ".name"
         LogInfo.begin_track("[Not Exist] Loading from %s...",
                             txt_data_file)
         # The source is opened first so that a missing source file does not
         # leave an empty cache behind; `with` closes the cache file even
         # when decoding a line raises.
         with codecs.open(txt_data_file, 'r', encoding=encoding) as fin, \
                 codecs.open(data_file, 'w', encoding=encoding) as fout:
             for line in fin:
                 query_idx, query_len, label, intent, link_mask, entity_idx\
                     = self.decode_line(line)
                 fout.write("\t".join([
                     " ".join([str(x) for x in query_idx]),
                     str(query_len),
                     " ".join([str(x) for x in label]),
                     str(intent),
                     " ".join([str(x) for x in link_mask]),
                     " ".join([str(x) for x in entity_idx]),
                 ]) + "\n")
                 query_idxs.append(query_idx)
                 query_lens.append(query_len)
                 labels.append(label)
                 intents.append(intent)
                 link_masks.append(link_mask)
                 entity_idxs.append(entity_idx)
                 cnt += 1
                 LogInfo.show_line(cnt, 1000000)
         LogInfo.logs("Write into %s.", data_file)
     # Both branches report the same attribute (the build branch used to
     # read `self.max`, unlike the cache branch).
     LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
     self.data = list(
         zip(query_idxs, query_lens, labels, intents, link_masks,
             entity_idxs))
     self.data_size = len(self.data)
     LogInfo.end_track("Loaded. Size: %d.", self.data_size)
コード例 #29
0
ファイル: single_task.py プロジェクト: lkq1992yeah/CompQA
    def _build_graph(self):
        """
        Build the TensorFlow graph of the tagger.

        Pipeline: embedding lookup -> BidirectionalRNNEncoder -> value clip ->
        per-token linear layer producing self.logits. In TRAIN mode a CRF
        negative log-likelihood loss and the train op are added on top.

        Hyper-parameters are read from self.config (max_seq_len, batch_size,
        vocab_size, embedding_dim, num_units, grad_clip, label_num,
        optimizer, lr and the RNN keys listed below); the embedding matrix is
        initialised from self.embedding_vocab.
        :return: None, all tensors are stored as attributes on self
        """
        # int32 inputs: query_idx and label are [None, max_seq_len], query_len is [None]
        self.query_idx = tf.placeholder(dtype=tf.int32,
                                        shape=[None, self.config.get("max_seq_len")])
        self.query_len = tf.placeholder(dtype=tf.int32,
                                        shape=[None, ])
        self.label = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.config.get("max_seq_len")])

        # Used below as a static dimension of the CRF padding tensors, so
        # training batches presumably must contain exactly this many rows
        # (verify against the feeding code).
        self.batch_size = self.config.get("batch_size")

        with tf.device('/cpu:0'), tf.name_scope("embedding_layer"):
            # [vocab_size, embedding_dim] variable, initialised from self.embedding_vocab
            term_embedding = tf.get_variable(
                name="embedding",
                shape=[self.config.get("vocab_size"), self.config.get("embedding_dim")],
                dtype=tf.float32,
                initializer=tf.constant_initializer(self.embedding_vocab)
            )
            self.query_embedding = tf.nn.embedding_lookup(term_embedding, self.query_idx)
            # Cut the embedded query into max_seq_len per-step tensors:
            # tf.split:    Tensor -> list tensors
            # tf.stack:    list of tensors -> one tensor
            self.query_slice = [
                tf.squeeze(_input, [1])
                for _input in tf.split(self.query_embedding,
                                       self.config.get("max_seq_len"),
                                       axis=1)
            ]
            # better style: use unstack!  one tensor -> list of tensors
            # equal to the above one
            # self.query_slice = tf.unstack(self.query_embedding, axis=1)

        # bi-LSTM
        with tf.name_scope("rnn_encoder"):
            # Forward only the RNN-related keys of the config to the encoder.
            rnn_config = dict()
            key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
                        "dropout_output_keep_prob", "num_layers"]
            for key in key_list:
                rnn_config[key] = self.config.get(key)
            rnn_encoder = BidirectionalRNNEncoder(rnn_config, self.mode)
            self.biLstm = rnn_encoder.encode(self.query_slice, self.query_len)

        # output dim = 2 * rnn cell dim (fw + bw)
        self.hidden_dim = self.config.get("num_units") * 2
        # Element-wise clip of the encoder output to [-grad_clip, grad_clip].
        # Despite the config key name this clips values, not gradients.
        self.biLstm_clip = tf.clip_by_value(self.biLstm.attention_values,
                                            -self.config.get("grad_clip"),
                                            self.config.get("grad_clip"))
        # training parameters
        with tf.name_scope("parameters"):
            # Linear projection hidden_dim -> label_num
            self.W_l = tf.get_variable(name="W_l",
                                       shape=[self.hidden_dim,
                                              self.config.get("label_num")],
                                       dtype=tf.float32,
                                       initializer
                                       =tf.contrib.layers.xavier_initializer(uniform=True))
            self.b_l = tf.get_variable(name="b_l",
                                       shape=[self.config.get("label_num")],
                                       dtype=tf.float32,
                                       initializer=tf.constant_initializer(0.0))

        # above bi-LSTM
        # Flatten to [-1, hidden_dim] so one matmul scores every position
        # (presumably attention_values is [B, T, hidden_dim] -- TODO confirm).
        self.outputs = tf.reshape(tensor=self.biLstm_clip,
                                  shape=[-1, self.hidden_dim])
        self.label_matrix = tf.nn.xw_plus_b(self.outputs, self.W_l, self.b_l)
        # [B, T, label_num]
        self.logits = tf.reshape(tensor=self.label_matrix,
                                 shape=[-1, self.config.get("max_seq_len"),
                                        self.config.get("label_num")])
        # [label_num + 1, label_num + 1]: one extra row/column for the
        # additional label index (label_num) that the TRAIN branch prepends.
        # NOTE(review): outside TRAIN mode self.logits keeps label_num
        # columns while this matrix is (label_num + 1) square -- confirm the
        # decoding code accounts for that.
        self.transition_mat = tf.get_variable(
            "transitions",
            shape=[self.config.get("label_num")+1, self.config.get("label_num")+1],
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))

        # ===================================== Loss ====================================== #
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:

            # # softmax sequence loss for sequence nlu
            # self.loss = softmax_sequence_loss(logits=self.logits,
            #                                   targets=self.label,
            #                                   sequence_length=self.query_len)
            # self.loss = tf.reduce_mean(self.loss)

            # padding logits for crf loss, length += 1
            small = -1000.0
            # Extra first time step, [batch_size, 1, label_num + 1]: every real
            # label scores -1000 and only the extra label scores 0.
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.config.get("label_num")]),
                 tf.zeros(shape=[self.batch_size, 1, 1])],
                axis=-1
            )
            # The LogInfo.logs calls in this branch print static shapes for debugging.
            LogInfo.logs(start_logits.get_shape().as_list())
            # Extra label column for the real time steps, [batch_size, max_seq_len, 1],
            # filled with -1000.
            pad_logits = tf.cast(small * tf.ones([self.batch_size,
                                                  self.config.get("max_seq_len"), 1]), tf.float32)
            LogInfo.logs(pad_logits.get_shape().as_list())
            # Append the extra column, then prepend the extra step; self.logits
            # is rebound to the padded tensor with max_seq_len + 1 steps and
            # label_num + 1 labels.
            self.logits = tf.concat([self.logits, pad_logits], axis=-1)
            self.logits = tf.concat([start_logits, self.logits], axis=1)
            LogInfo.logs(self.logits.get_shape().as_list())
            # Gold tags get the extra label index (label_num) prepended to
            # line up with the extra first step.
            targets = tf.concat(
                [tf.cast(self.config.get("label_num")*tf.ones([self.batch_size, 1]),
                         tf.int32),
                 self.label], axis=-1
            )
            LogInfo.logs(targets.get_shape().as_list())

            # CRF layer
            # sequence_lengths is query_len + 1 because of the prepended step;
            # self.transition_mat is rebound to the second return value.
            self.log_likelihood, self.transition_mat = \
                tf.contrib.crf.crf_log_likelihood(
                    inputs=self.logits,
                    tag_indices=targets,
                    transition_params=self.transition_mat,
                    sequence_lengths=self.query_len+1)
            # Batch mean of the negative log-likelihood
            self.loss = tf.reduce_mean(-self.log_likelihood)

            # train op
            self.global_step = tf.Variable(0, name="global_step",  trainable=False)
            optimizer = get_optimizer(self.config.get("optimizer"), self.config.get("lr"))
            # Gradients are applied as computed; no gradient clipping happens here.
            grads_and_vars = optimizer.compute_gradients(self.loss)
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
コード例 #30
0
    def load_data(self):
        """
        Read the embedding tables, the COPA questions and the COPA labels.

        Fills self.sync / self.syne_neg and then self.sync_neg / self.syne
        from pairs of files read in lockstep (a ValueError while handling a
        row pair is logged instead of propagated), appends 1000
        [sentence, option1, option2] entries to self.copa_data, keeping the
        second ':'-separated field of every token, and appends one
        [spt[1], int(spt[2])] entry per label line to self.copa_ground.
        :return: None
        """
        def read_vector_pair(cause_path, effect_path, cause_table, effect_table):
            # Each row is "word v1 v2 ..."; row k of both files is consumed together.
            with open(cause_path) as finc, open(effect_path) as fine:
                for cnt, (linec, linee) in enumerate(zip(finc, fine), 1):
                    LogInfo.show_line(cnt, 100000)
                    cause_row = linec.strip().split()
                    effect_row = linee.strip().split()
                    cause_word, effect_word = cause_row[0], effect_row[0]
                    try:
                        cause_vec = map(float, cause_row[1:])
                        effect_vec = map(float, effect_row[1:])
                        cause_table[cause_word] = cause_vec
                        effect_table[effect_word] = effect_vec
                    except ValueError:
                        # report the offending row pair and move on
                        LogInfo.logs("[error] %s | %s",
                                     cause_row[0:3], effect_row[0:3])

        def second_field(token):
            return token.split(':')[1]

        def parse_tokens(raw_line):
            # Unlike a POS-filtered loader, every token of the line is kept.
            return map(second_field, raw_line.strip().split())

        LogInfo.begin_track("Loading data...")
        # (syn0_w2v_200.txt was used for both files in an earlier experiment)
        read_vector_pair(
            "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt",
            "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt",
            self.sync, self.syne_neg)
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        read_vector_pair(
            "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt",
            "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt",
            self.sync_neg, self.syne)
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # Every question occupies three consecutive lines: sentence, option 1, option 2.
        with open("/home/yuchen/data/copa_phr.txt") as fin:
            for _ in range(1000):
                raw_sentence = fin.readline()
                raw_option1 = fin.readline()
                raw_option2 = fin.readline()
                sentence = parse_tokens(raw_sentence)
                option1 = parse_tokens(raw_option1)
                option2 = parse_tokens(raw_option2)
                self.copa_data.append([sentence, option1, option2])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                fields = line.strip().split('\t')
                self.copa_ground.append([fields[1], int(fields[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()