Beispiel #1
0
    def eval_avg(self, setting=1):
        """
        Evaluate items 500..999 of the COPA data with averaged word vectors.

        The premise and both options are converted with ``get_repr``; the
        option whose vector is more similar to the premise (per
        ``get_similarity``) is the prediction, with option 2 chosen on ties.
        The resulting accuracy is written to the log; nothing is returned.

        :param setting: representation setting forwarded to ``get_repr``.
        """
        LogInfo.begin_track(
            "Eval on Copa using average word representations using setting %d...",
            setting)
        n_correct = 0
        for idx in range(500, 1000):
            direction = self.copa_ground[idx][0]
            premise, first_opt, second_opt = self.copa_data[idx]

            premise_vec = self.get_repr(premise, direction, setting, 'q')
            first_vec = self.get_repr(first_opt, direction, setting, 'o')
            second_vec = self.get_repr(second_opt, direction, setting, 'o')

            first_score = self.get_similarity(premise_vec, first_vec)
            second_score = self.get_similarity(premise_vec, second_vec)

            gold = self.copa_ground[idx][1]
            # Option 1 is predicted only on a strictly higher score.
            predicted = 1 if first_score > second_score else 2
            if gold == predicted:
                n_correct += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(n_correct) / 500, n_correct, 500)
        LogInfo.end_track()
Beispiel #2
0
 def load_pinlei(self):
     """
     Collect bracketed pinlei names from the word2vec vocabulary file.

     The first whitespace-separated field of every line is inspected and
     added to ``self.pinlei`` when it starts with ``[[``.  Blank lines are
     skipped.
     """
     LogInfo.begin_track("Load pinlei names...")
     with codecs.open("/u01/xusheng/word2vec/vec/yyh_pinlei.txt",
                      'r',
                      encoding='utf-8') as fin:
         for line in fin:
             fields = line.split()
             # A blank line yields no fields; indexing [0] would raise
             # IndexError and abort the whole load.
             if not fields:
                 continue
             name = fields[0]
             if name.startswith("[["):
                 self.pinlei.add(name)
     LogInfo.end_track("Pinlei name loaded. Size: %d.", len(self.pinlei))
Beispiel #3
0
 def add_pinlei_tag_yyh(self):
     """
     Tag pinlei names inside the word2vec training corpus.

     Reads ``<root_fp>/yyh_w2v_train.txt`` line by line.  At each token
     position the longest run of 4, 3, 2 or 1 consecutive tokens whose
     concatenation is in ``self.pinlei_set`` is replaced by
     ``[[<concatenation>]]``; unmatched tokens are copied unchanged.  The
     result is written to ``<root_fp>/yyh_w2v_train.txt.pinlei_tag`` with
     every token followed by a single space.
     """
     LogInfo.begin_track("Begin adding tags for pinleis...")
     src_path = self.root_fp + "/yyh_w2v_train.txt"
     dst_path = self.root_fp + "/yyh_w2v_train.txt.pinlei_tag"
     cnt = 0
     # Context managers close both files even if a line raises.
     with codecs.open(src_path, 'r', encoding='utf-8') as fin, \
             codecs.open(dst_path, 'w', encoding='utf-8') as fout:
         for line in fin:
             spt = line.strip().split()
             pieces = []
             i = 0
             while i < len(spt):
                 # Longest match first, so a 4-term name wins over its
                 # own prefixes.
                 for width in (4, 3, 2, 1):
                     if i + width > len(spt):
                         continue
                     # The window is spt[i:i + width].  The previous code
                     # used spt[3] as the last term of the 4-gram, which
                     # is only right when i == 0.
                     candidate = "".join(spt[i:i + width])
                     if candidate not in self.pinlei_set:
                         continue
                     if width == 4:
                         LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                      spt[i], spt[i + 1], spt[i + 2],
                                      spt[i + 3])
                     pieces.append("[[" + candidate + "]]")
                     i += width
                     break
                 else:
                     # No pinlei starts here: keep the token as-is.
                     pieces.append(spt[i])
                     i += 1
             # Every piece keeps its trailing space, as in the old output.
             new_line = "".join(piece + " " for piece in pieces)
             fout.write(new_line + "\n")
             cnt += 1
             if cnt < 5:
                 LogInfo.logs("res ==> (%s)", new_line)
             LogInfo.show_line(cnt, 100000)
     LogInfo.end_track("Pinlei tags added.")
Beispiel #4
0
    def eval_pair(self, setting=1, strategy=1):
        """
        Evaluate items 0..1870 by summing word-pair similarities.

        For each option the score is the sum of ``get_similarity`` over all
        (premise word, option word) pairs where both words have a vector.
        Option 1 is predicted only when its score is strictly higher.
        Accuracy is logged; nothing is returned.

        :param setting: forwarded to ``get_vec_map``.
        :param strategy: 1: plain sum, 2: divide by T1+T2, 3: divide by T1*T2
                         (T1/T2 are the premise/option lengths).
        """
        LogInfo.begin_track(
            "Eval on ROC using word pairs using setting %d and strategy %d...",
            setting, strategy)

        def pair_sum(premise_words, option_words, premise_vecs, option_vecs):
            # Accumulate pair similarities in premise-major order.
            total = 0.0
            for p_word in premise_words:
                for o_word in option_words:
                    if p_word in premise_vecs and o_word in option_vecs:
                        total += self.get_similarity(premise_vecs[p_word],
                                                     option_vecs[o_word])
            return total

        n_correct = 0
        for idx in range(0, 1871):
            premise, first_opt, second_opt = self.copa_data[idx]
            direction = self.copa_ground[idx][0]
            premise_vecs = self.get_vec_map(ask4=direction, setting=setting,
                                            role='q')
            option_vecs = self.get_vec_map(ask4=direction, setting=setting,
                                           role='o')

            first_score = pair_sum(premise, first_opt, premise_vecs,
                                   option_vecs)
            second_score = pair_sum(premise, second_opt, premise_vecs,
                                    option_vecs)

            if strategy == 2:
                first_score /= (len(premise) + len(first_opt))
                second_score /= (len(premise) + len(second_opt))
            elif strategy == 3:
                first_score /= (len(premise) * len(first_opt))
                second_score /= (len(premise) * len(second_opt))

            gold = self.copa_ground[idx][1]
            predicted = 1 if first_score > second_score else 2
            if gold == predicted:
                n_correct += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(n_correct) / 1871, n_correct, 1871)
        LogInfo.end_track()
Beispiel #5
0
 def load_pinlei(self):
     """
     Fill ``self.pinlei_set`` from ``<root_fp>/raw/kg_pinlei_id``.

     The file is tab-separated; the first column of every line that has
     at least two columns is stored.  Shorter lines are ignored.
     """
     LogInfo.begin_track("Load pinlei names...")
     source = self.root_fp + "/raw/kg_pinlei_id"
     with codecs.open(source, 'r', encoding='utf-8') as fin:
         for raw in fin:
             columns = raw.strip().split("\t")
             if len(columns) >= 2:
                 self.pinlei_set.add(columns[0])
     LogInfo.end_track("%d names loaded.", len(self.pinlei_set))
Beispiel #6
0
    def load_vocab_embedding(self, embedding_file, encoding):
        """
        Read one embedding row per vocabulary entry into ``self.vocab_embedding``.

        Each line is ``<word> <v1> <v2> ...``; the row is stored at the
        position ``self.vocab_index_dict`` assigns to ``<word>``.  The
        number of lines read must equal the vocabulary size.

        :param embedding_file: path of the embedding text file.
        :param encoding: text encoding used to open the file.
        """
        LogInfo.begin_track("Loading embeddings from %s...", embedding_file)
        table = [None] * len(self.vocab_index_dict)
        n_rows = 0
        with codecs.open(embedding_file, 'r', encoding=encoding) as fin:
            for n_rows, raw in enumerate(fin, 1):
                fields = raw.split()
                # The right-hand side is evaluated before the subscript, so
                # a bad number is reported before an unknown word.
                table[self.vocab_index_dict[fields[0]]] = \
                    [float(value) for value in fields[1:]]
                LogInfo.show_line(n_rows, 50000)

        assert n_rows == len(table)
        self.vocab_embedding = np.asarray(table)
        LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
Beispiel #7
0
 def load(self, data_file, encoding):
     """
     Decode every line of ``data_file`` into ``self.data``.

     ``decode_line`` must return a (context_idx, context_seq, pinlei_idx)
     triple; the triples are stored in file order and ``self.data_size``
     is set to their count.

     :param data_file: path of the text file to read.
     :param encoding: text encoding used to open the file.
     """
     LogInfo.begin_track("Loading data from %s...", data_file)
     records = []
     with codecs.open(data_file, 'r', encoding=encoding) as fin:
         for line_no, raw in enumerate(fin, 1):
             context_idx, context_seq, pinlei_idx = self.decode_line(raw)
             records.append((context_idx, context_seq, pinlei_idx))
             LogInfo.show_line(line_no, 10000)
     self.data = records
     self.data_size = len(records)
     LogInfo.end_track()
Beispiel #8
0
    def load_vocab_name(self, vocab_file, encoding):
        """
        Rebuild the word <-> index mappings from a one-word-per-line file.

        Existing mappings are cleared first.  Line k (0-based) of the file,
        stripped of surrounding whitespace, receives index k.

        :param vocab_file: path of the vocabulary file.
        :param encoding: text encoding used to open the file.
        """
        LogInfo.begin_track("Loading vocab from %s...", vocab_file)
        self.vocab_size = 0
        self.index_vocab_dict.clear()
        self.vocab_index_dict.clear()
        n_words = 0
        with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
            for n_words, raw in enumerate(fin, 1):
                word = raw.strip()
                self.vocab_index_dict[word] = n_words - 1
                self.index_vocab_dict.append(word)
                LogInfo.show_line(n_words, 50000)

        self.vocab_size = n_words
        LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
Beispiel #9
0
def main():
    """
    Command-line driver for the word-pair COPA evaluation.

    Loads the COPA data, the labels and four vector dictionaries, then
    dispatches on ``sys.argv[1]``:

    * ``full``: sweeps ``ratio`` over 0.0..2.0 and ``lamd`` over 0.0..1.0
      (both in steps of 0.1) and prints the accuracy for each pair.
    * ``case``: reads a float from ``sys.argv[2]`` and an int from
      ``sys.argv[3]``, runs one verbose evaluation and logs its accuracy.

    Any other mode only performs the loading.  The ``print`` statements
    are Python 2 syntax.
    """
    copa, worddic = readcopa()
    label = readlabel()
    # NOTE(review): the variable names and file names of the last two loads
    # look crossed (cnegdic <- "syne_...", edic <- "syncneg_...") -- confirm
    # against the argument order word_word1 expects.
    cdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt")
    enegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt")
    cnegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt")
    edic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt")

    verbose = False

    # Imported locally; the mode is the first command-line argument
    # (IndexError if it is missing).
    import sys
    mode = sys.argv[1]
    if mode == 'full':
        # Grid search: 21 ratio values x 11 lambda values.
        for ratio in range(21):
            for lamd in range(11):
                acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, 4, True, ratio*0.1, verbose)
                print ratio*0.1, lamd*0.1, acc
        # print "word pair with norm:"
        # for setting in range(3):
        #     for lamd in range(11):
        #         acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, setting, True, verbose)
        #         print lamd*0.1, setting, acc

        # print "\nword pair without norm:"
        # for setting in range(3):
        #     for lamd in range(11):
        #         acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, setting, False, verbose)
        #         print lamd*0.1, setting, acc
        #
        # print "\nsentence level with norm:"
        # for lamd in range(11):
        #     acc = sen_sen(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, True)
        #     print lamd*0.1,  acc
        #
        # print "\nsentence level without norm:"
        # for lamd in range(11):
        #     acc = sen_sen(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, False)
        #     print lamd*0.1, acc
    elif mode == 'case':
        para1 = float(sys.argv[2])
        para2 = int(sys.argv[3])
        LogInfo.begin_track("case tracing for word-pair & lambda=%.1f, setting=%d:", para1, para2)
        verbose = True
        # NOTE(review): this call passes 11 positional arguments while the
        # 'full' branch passes 12; `verbose` lands where 'full' passes
        # `ratio*0.1`. Verify against word_word1's signature.
        acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic, para1, 500, para2, True, verbose)
        LogInfo.logs("[Accuracy] %.4f", acc)
        LogInfo.end_track()
Beispiel #10
0
    def eval_avg_lambda(self, lamb=1.0):
        """
        Evaluate items 500..999 with a blend of two averaged-vector settings.

        Similarities are computed once with ``get_repr`` setting 1 and once
        with setting 2; each option's final score is
        ``lamb * score(setting 1) + (1 - lamb) * score(setting 2)``.
        Option 1 is predicted only on a strictly higher score.  Accuracy is
        logged; nothing is returned.

        :param lamb: weight of the setting-1 similarity.
        """
        LogInfo.begin_track(
            "Eval on Copa using average word representations using lambda %.2f...",
            lamb)
        n_correct = 0
        for idx in range(500, 1000):
            direction = self.copa_ground[idx][0]
            premise, first_opt, second_opt = self.copa_data[idx]

            # One (option-1, option-2) similarity pair per setting, in the
            # order setting 1 then setting 2.
            per_setting = []
            for setting in (1, 2):
                premise_vec = self.get_repr(premise, direction, setting, 'q')
                first_vec = self.get_repr(first_opt, direction, setting, 'o')
                second_vec = self.get_repr(second_opt, direction, setting, 'o')
                sim_first = self.get_similarity(premise_vec, first_vec)
                sim_second = self.get_similarity(premise_vec, second_vec)
                per_setting.append((sim_first, sim_second))
            (first_a, second_a), (first_b, second_b) = per_setting

            first_score = (first_a * lamb) + (first_b * (1 - lamb))
            second_score = (second_a * lamb) + (second_b * (1 - lamb))

            gold = self.copa_ground[idx][1]
            predicted = 1 if first_score > second_score else 2
            if gold == predicted:
                n_correct += 1

        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     float(n_correct) / 500, n_correct, 500)
        LogInfo.end_track()
Beispiel #11
0
    def prepare_model_data(self, pinlei_num):
        """
        Build the multi-pinlei evaluation files from labelled queries.

        Reads ``<root_fp>/query_label.txt.<pinlei_num>`` (tab-separated;
        column 1 is the context, columns 2.. are the pinlei labels).  A line
        is kept when all of its ``pinlei_num`` labels, wrapped as
        ``[[label]]``, are in ``self.pinlei`` and the context has between 6
        and 15 space-separated terms.  For each label of a kept line,
        ``context<TAB>[[label]]`` goes to ``model_data_test.<n>.name`` and
        column 0 of the line goes to ``model_data_test.<n>.check``.

        :param pinlei_num: number of pinlei labels per line; also stored in
                           ``self.pinlei_num``.
        """
        self.pinlei_num = pinlei_num
        LogInfo.begin_track("Generate Multi-Pinlei Data for evaluation...")
        suffix = str(self.pinlei_num)
        out_base = self.root_fp + "/model_data_test." + suffix
        cnt = 0
        not_cover = set()
        # Context managers guarantee all three files are closed (and the
        # outputs flushed) even if a malformed line raises.
        with codecs.open(self.root_fp + "/query_label.txt." + suffix,
                         'r', encoding='utf-8') as fin, \
                codecs.open(out_base + ".name",
                            'w', encoding='utf-8') as fout, \
                codecs.open(out_base + ".check",
                            'w', encoding='utf-8') as fsho:
            for line in fin:
                cnt += 1
                if cnt % 100000 == 0:
                    LogInfo.logs("%d lines processed.", cnt)
                    fout.flush()
                spt = line.strip().split("\t")
                context = spt[1]
                # Explicit indexing keeps the IndexError for lines with too
                # few label columns (a slice would silently truncate).
                pinleis = ["[[" + spt[i] + "]]"
                           for i in range(2, 2 + self.pinlei_num)]
                uncovered = [p for p in pinleis if p not in self.pinlei]
                if uncovered:
                    not_cover.update(uncovered)
                    continue
                n_terms = len(context.split(" "))
                if n_terms < 6 or n_terms > 15:
                    continue
                for pinlei in pinleis:
                    fout.write(context + "\t" + pinlei + "\n")
                    fsho.write(spt[0] + "\n")

        LogInfo.end_track("%d pinlei not cover.", len(not_cover))
Beispiel #12
0
def load_configs(fp):
    """
    Parse a tab-separated ``key<TAB>value<TAB>type`` config file into a dict.

    Blank lines, lines starting with ``#`` and lines without a tab are
    ignored; lines with fewer than three fields are reported and skipped.
    The type field selects the conversion:

    * ``d`` / ``int``: ``int(value)``
    * ``f`` / ``float`` / ``double``: ``float(value)``
    * ``b`` / ``bool``: True for ``true``/``True``/``TRUE``/``1``, else False
    * ``tf`` / ``tensorflow``: ``tf.nn.relu`` / ``sigmoid`` / ``tanh``;
      any other value leaves the key unset
    * type ``None`` or value ``None``: ``None``
    * anything else: the raw string

    :param fp: path of the config file.
    :return: dict mapping each key to its converted value.
    """
    LogInfo.begin_track('Loading config from %s: ', fp)
    config_dict = {}
    with open(fp, 'r') as br:
        for raw in br:
            line = raw.strip()
            if line.startswith('#') or not line or '\t' not in line:
                continue
            spt = line.split('\t')
            if len(spt) < 3:
                LogInfo.logs("[%s] is invalid, pls add type!", line)
                continue
            k, v_str, t = spt[0], spt[1], spt[2]
            if t in ("d", "int"):
                config_dict[k] = int(v_str)
            elif t in ("f", "float", "double"):
                config_dict[k] = float(v_str)
            elif t in ("b", "bool"):
                config_dict[k] = v_str in ("true", "True", "TRUE", "1")
            elif t in ("tf", "tensorflow"):
                # Looked up lazily so tf is only touched for these entries.
                if v_str in ('relu', 'sigmoid', 'tanh'):
                    config_dict[k] = getattr(tf.nn, v_str)
            elif "None" in (t, v_str):
                config_dict[k] = None
            else:
                config_dict[k] = v_str
            LogInfo.logs('%s = %s', k, v_str)

    LogInfo.end_track()
    return config_dict
Beispiel #13
0
    def prepare_model_data(self):
        """
        Build the training file with one positive and 19 sampled negatives.

        Reads ``<root_fp>/query_label.txt.1`` (tab-separated; column 1 is
        the context, column 2 the single pinlei label).  Lines whose
        ``[[label]]`` is not in ``self.pinlei`` or whose context does not
        have between 6 and 15 space-separated terms are counted and skipped.
        For every kept line, ``context<TAB>[[label]]`` is written to
        ``<root_fp>/model_data_train.name``, followed by one line per
        negative returned by ``self.neg_sample_random(pinlei, 19)``.
        """
        LogInfo.begin_track("Generate model data...")
        not_cover = 0
        not_context = 0
        cnt = 0
        # ".1" means single pinlei.  Context managers close both files even
        # if a malformed line or the sampler raises.
        with codecs.open(self.root_fp + "/query_label.txt.1",
                         'r', encoding='utf-8') as fin, \
                codecs.open(self.root_fp + "/model_data_train.name",
                            'w', encoding='utf-8') as fout:
            for line in fin:
                cnt += 1
                if cnt % 100000 == 0:
                    LogInfo.logs("%d lines processed.", cnt)
                    fout.flush()
                spt = line.strip().split("\t")
                context = spt[1]
                pinlei = "[[" + spt[2] + "]]"
                if pinlei not in self.pinlei:
                    not_cover += 1
                    continue
                n_terms = len(context.split(" "))
                if n_terms < 6 or n_terms > 15:
                    not_context += 1
                    continue
                fout.write(context + "\t" + pinlei + "\n")
                for neg in self.neg_sample_random(pinlei, 19):
                    fout.write(context + "\t" + neg + "\n")

        LogInfo.end_track("Model data prepared. Size: %d. (%d, %d).",
                          cnt - not_context - not_cover, not_cover,
                          not_context)
Beispiel #14
0
 def load_vocab(self, vocab_file, embedding_dim, encoding):
     """
     Load the vocabulary and its embeddings from one text file.

     Each line is ``<word> <v1> <v2> ...``.  Index 0 is reserved for the
     placeholder ``[[NULL]]`` with an all-zero vector of length
     ``embedding_dim``; file words receive indices 1, 2, ... in file
     order.  Blank lines are skipped.  On return ``self.vocab_embedding``
     is a numpy array and ``self.vocab_size`` its number of rows.

     :param vocab_file: path of the vocabulary/embedding file.
     :param embedding_dim: length of the zero vector used for [[NULL]].
     :param encoding: text encoding used to open the file.
     """
     LogInfo.begin_track("Loading vocab from %s...", vocab_file)
     self.vocab_size = 0
     self.index_vocab_dict.clear()
     self.vocab_index_dict.clear()
     # Rows are gathered in a fresh local list.  self.vocab_embedding is
     # rebound to a numpy array below, and arrays have neither clear() nor
     # append(), so mutating the attribute in place broke any second call.
     embeddings = []
     with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
         # 0 embedding for not-found query term
         self.vocab_index_dict["[[NULL]]"] = 0
         self.index_vocab_dict.append("[[NULL]]")
         embeddings.append([0.0] * embedding_dim)
         for line in fin:
             spt = line.split()
             if not spt:
                 # Blank line: nothing to index (previously an IndexError).
                 continue
             self.vocab_index_dict[spt[0]] = len(embeddings)
             self.index_vocab_dict.append(spt[0])
             embeddings.append([float(value) for value in spt[1:]])
             LogInfo.show_line(len(embeddings), 50000)
     self.vocab_size = len(embeddings)
     self.vocab_embedding = np.array(embeddings)
     LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
Beispiel #15
0
 def load(self, data_file, encoding):
     """
     Load the dataset, using ``data_file`` as an index cache.

     If ``data_file`` exists it is parsed directly: six tab-separated
     fields per line (query indices, query length, labels, intent, link
     mask, entity indices), list fields space-separated.  Otherwise
     ``data_file + ".name"`` is decoded line by line with
     ``self.decode_line`` and every decoded record is also written to
     ``data_file`` in that format.  The records end up in ``self.data``
     as 6-tuples and ``self.data_size`` holds their count.

     NOTE(review): if decoding fails part-way, a partial ``data_file`` is
     left behind and will be taken for a complete cache on the next run.

     :param data_file: path of the cached index file.
     :param encoding: text encoding used for every file.
     """
     LogInfo.begin_track("Loading data from %s...", data_file)
     records = []
     if os.path.isfile(data_file):
         LogInfo.begin_track("[Exist] Loading from %s...", data_file)
         with codecs.open(data_file, 'r', encoding=encoding) as fin:
             for cnt, line in enumerate(fin, 1):
                 spt = line.strip().split("\t")
                 records.append((
                     [int(idx) for idx in spt[0].split(" ")],
                     int(spt[1]),
                     [int(idx) for idx in spt[2].split(" ")],
                     int(spt[3]),
                     [int(idx) for idx in spt[4].split(" ")],
                     [int(idx) for idx in spt[5].split(" ")],
                 ))
                 LogInfo.show_line(cnt, 1000000)
         LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
     else:
         txt_data_file = data_file + ".name"
         LogInfo.begin_track("[Not Exist] Loading from %s...",
                             txt_data_file)
         # The source is opened first, so a missing .name file no longer
         # leaves an empty cache behind; both handles close on error.
         with codecs.open(txt_data_file, 'r', encoding=encoding) as fin, \
                 codecs.open(data_file, 'w', encoding=encoding) as fout:
             for cnt, line in enumerate(fin, 1):
                 query_idx, query_len, label, intent, link_mask, entity_idx\
                     = self.decode_line(line)
                 fout.write("\t".join([
                     " ".join([str(x) for x in query_idx]),
                     str(query_len),
                     " ".join([str(x) for x in label]),
                     str(intent),
                     " ".join([str(x) for x in link_mask]),
                     " ".join([str(x) for x in entity_idx]),
                 ]) + "\n")
                 records.append((query_idx, query_len, label, intent,
                                 link_mask, entity_idx))
                 LogInfo.show_line(cnt, 1000000)
         LogInfo.logs("Write into %s.", data_file)
         # Was `self.max`; the parallel branch above reports
         # self.max_seq_len under the same message.
         LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
     self.data = records
     self.data_size = len(self.data)
     LogInfo.end_track("Loaded. Size: %d.", self.data_size)
Beispiel #16
0
def eval_seq_crf_with_o_atis(y_pred_, y_true_, method='precision'):
    """
    Span-level precision or recall for ATIS slot tagging, including "O".

    Tags are integer indices into the ATIS tag list below.  The proposing
    sequence is scanned span by span and each span counts as correct when
    the reference sequence carries exactly the same span.  For
    ``method='recall'`` the two inputs are swapped, so the same routine
    measures how many true spans were predicted.

    :param y_pred_: [B, T, ] predicted tag indices
    :param y_true_: [B, T, ] gold tag indices
    :param method: 'precision' or 'recall'
    :return: overall precision or recall, i.e. correct spans / proposed
             spans summed over all slot types (0.0 when there are none)
    :raises ValueError: if ``method`` is neither 'precision' nor 'recall'
    """
    # Validate before the track is opened; an unknown method used to leave
    # y_pred / y_true unbound and fail later with a NameError.
    if method not in ('precision', 'recall'):
        raise ValueError(
            "method must be 'precision' or 'recall', got %r" % (method,))

    tag_dict = [
        'O', 'B-day_number', 'B-stoploc.state_code', 'B-toloc.state_code',
        'B-time_relative', 'B-fromloc.state_code', 'B-stoploc.airport_code',
        'B-airline_code', 'B-connect', 'B-depart_time.period_mod', 'B-flight',
        'B-arrive_time.period_mod', 'B-booking_class', 'B-month_name',
        'B-return_date.day_name', 'B-depart_date.month_name',
        'B-arrive_date.today_relative', 'B-return_time.period_of_day',
        'B-aircraft_code', 'B-arrive_date.date_relative', 'B-state_code',
        'B-days_code', 'B-airport_code', 'B-period_of_day',
        'B-arrive_date.day_name', 'B-flight_days', 'B-return_time.period_mod',
        'B-fromloc.airport_code', 'B-arrive_date.month_name', 'B-mod',
        'B-stoploc.airport_name', 'B-compartment', 'B-toloc.airport_code',
        'B-depart_date.date_relative', 'B-day_name', 'B-or',
        'B-depart_date.year', 'B-depart_date.day_name', 'B-toloc.country_name',
        'B-return_date.month_name', 'B-meal', 'B-stoploc.city_name',
        'I-stoploc.city_name', 'B-round_trip', 'I-round_trip', 'B-state_name',
        'I-state_name', 'B-fromloc.city_name', 'I-fromloc.city_name',
        'B-airline_name', 'I-airline_name', 'B-flight_stop', 'I-flight_stop',
        'B-fromloc.airport_name', 'I-fromloc.airport_name',
        'B-arrive_time.start_time', 'I-arrive_time.start_time',
        'B-cost_relative', 'I-cost_relative', 'B-city_name', 'I-city_name',
        'B-arrive_time.end_time', 'I-arrive_time.end_time', 'B-meal_code',
        'I-meal_code', 'B-depart_date.day_number', 'I-depart_date.day_number',
        'B-meal_description', 'I-meal_description', 'B-arrive_time.time',
        'I-arrive_time.time', 'B-depart_date.today_relative',
        'I-depart_date.today_relative', 'B-fare_amount', 'I-fare_amount',
        'B-airport_name', 'I-airport_name', 'B-flight_time', 'I-flight_time',
        'B-flight_number', 'I-flight_number', 'B-toloc.airport_name',
        'I-toloc.airport_name', 'B-flight_mod', 'I-flight_mod',
        'B-depart_time.time_relative', 'I-depart_time.time_relative',
        'B-return_date.date_relative', 'I-return_date.date_relative',
        'B-economy', 'I-economy', 'B-class_type', 'I-class_type',
        'B-toloc.state_name', 'I-toloc.state_name',
        'B-arrive_time.period_of_day', 'I-arrive_time.period_of_day',
        'B-toloc.city_name', 'I-toloc.city_name', 'B-depart_time.start_time',
        'I-depart_time.start_time', 'B-return_date.day_number',
        'I-return_date.day_number', 'B-today_relative', 'I-today_relative',
        'B-depart_time.end_time', 'I-depart_time.end_time',
        'B-fromloc.state_name', 'I-fromloc.state_name', 'B-depart_time.time',
        'I-depart_time.time', 'B-return_date.today_relative',
        'I-return_date.today_relative', 'B-fare_basis_code',
        'I-fare_basis_code', 'B-arrive_date.day_number',
        'I-arrive_date.day_number', 'B-restriction_code', 'I-restriction_code',
        'B-transport_type', 'I-transport_type', 'B-time', 'I-time',
        'B-arrive_time.time_relative', 'I-arrive_time.time_relative',
        'B-depart_time.period_of_day', 'I-depart_time.period_of_day'
    ]
    # Indices 0..40 are 'O' and the B- tags that have no I- partner in the
    # list; from 41 on, every B- tag is directly followed by its I- tag.
    last_single_token_tag = 40

    LogInfo.begin_track("Eval seq %s on %d tags...", method, len(tag_dict))
    if method == 'precision':
        y_pred = np.array(y_pred_)
        y_true = np.array(y_true_)
    else:
        # Recall is precision with the roles of the two sequences swapped.
        y_pred = np.array(y_true_)
        y_true = np.array(y_pred_)

    # Slot names without the "B-"/"I-" prefix, plus 'O' itself.
    names = set()
    for tag in tag_dict:
        if tag == 'O':
            names.add('O')
        else:
            names.add(tag[2:])
    LogInfo.logs("%d different terms", len(names))
    correct = dict()
    act_cnt = dict()
    for name in names:
        correct[name] = 0
        act_cnt[name] = 0

    for line_pred, line_true in zip(y_pred, y_true):
        i = 0
        cnt = len(line_pred)
        while i < cnt:
            tag_num = line_pred[i]
            tag = tag_dict[tag_num]
            if tag_num <= last_single_token_tag:
                # tags with "B" without "I", including "O"
                if tag_num == 0:
                    kind = 'O'
                else:
                    kind = tag[2:]
                act_cnt[kind] += 1
                if line_true[i] == line_pred[i]:
                    correct[kind] += 1
                i += 1
                continue

            kind = tag[2:]
            sign = tag[0]
            if sign != 'B':
                # An I- tag that does not continue a span is not counted.
                i += 1
                continue

            # Extend the span over the following I- tags of the same kind.
            j = i + 1
            while j < cnt:
                next_tag = tag_dict[line_pred[j]]
                if next_tag[2:] == kind and next_tag[0] == 'I':
                    j += 1
                else:
                    break

            act_cnt[kind] += 1

            act_label = ' '.join([str(x) for x in line_true[i:j]])
            proposed_label = ' '.join([str(x) for x in line_pred[i:j]])
            # The span is correct only if the reference matches on [i, j)
            # and does not continue with the matching I- tag (B index + 1)
            # at position j.
            if act_label == proposed_label and (
                    j == cnt or line_true[j] != line_true[i] + 1):
                correct[kind] += 1
            i = j

    ret = dict()
    correct_total = 0
    cnt_total = 0
    for key in act_cnt:
        if act_cnt[key] == 0:
            ret[key] = 0.0
        else:
            ret[key] = correct[key] * 1.0 / act_cnt[key]
            LogInfo.logs("%s : %.4f(%d/%d)", key, ret[key], correct[key],
                         act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]

    # Computed once from the final totals (was recomputed per iteration).
    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total,
                 cnt_total)
    LogInfo.end_track()
    return overall
Beispiel #17
0
    def process_query(self):
        """
        Tag pinlei names in every query and split off the remaining context.

        Reads ``<root_fp>/query.txt``.  At each token position the longest
        run of 5 down to 1 consecutive tokens whose concatenation is in
        ``self.pinlei_set`` is treated as a pinlei: it is recorded as a
        label and replaced by ``[[<concatenation>]]`` in the tagged query.
        Unmatched tokens are copied to both the tagged query and the
        context.  Each output line in ``<root_fp>/query_label.txt`` is
        ``tagged<TAB>context<TAB>label1<TAB>label2...`` when at least one
        label was found, otherwise just ``tagged``.  Labels are kept in a
        set, so duplicates collapse and their order is not fixed.
        """
        LogInfo.begin_track("Begin adding tags for queries...")
        # Only matches of three or more terms are logged.
        found_formats = {
            5: "Found 5-term pinlei [%s|%s|%s|%s|%s]",
            4: "Found 4-term pinlei [%s|%s|%s|%s]",
            3: "Found 3-term pinlei [%s|%s|%s]",
        }
        cnt = 0
        # Context managers close both files even if a line raises.
        with codecs.open(self.root_fp + "/query.txt",
                         'r', encoding='utf-8') as fin, \
                codecs.open(self.root_fp + "/query_label.txt",
                            'w', encoding='utf-8') as fout:
            for line in fin:
                spt = line.strip().split()
                tagged_tokens = []
                context_tokens = []
                label = set()
                i = 0
                while i < len(spt):
                    # Longest match first, so a long name wins over its
                    # own prefixes.
                    for width in (5, 4, 3, 2, 1):
                        if i + width > len(spt):
                            continue
                        window = spt[i:i + width]
                        candidate = "".join(window)
                        if candidate not in self.pinlei_set:
                            continue
                        if width in found_formats:
                            LogInfo.logs(found_formats[width], *window)
                        label.add(candidate)
                        tagged_tokens.append("[[" + candidate + "]]")
                        i += width
                        break
                    else:
                        # No pinlei starts here: plain context token.
                        context_tokens.append(spt[i])
                        tagged_tokens.append(spt[i])
                        i += 1

                # Tokens contain no whitespace, so a plain join equals the
                # old "token + space ... then strip()" construction.
                tagged = " ".join(tagged_tokens)
                if label:
                    ret = tagged + "\t" + \
                        " ".join(context_tokens) + "\t" + \
                        "\t".join(label) + "\n"
                else:
                    ret = tagged + "\n"
                fout.write(ret)
                cnt += 1
                if cnt < 5:
                    LogInfo.logs("res ==> (%s)", ret.strip())
                LogInfo.show_line(cnt, 100000)
        LogInfo.end_track("Query processed.")
Beispiel #18
0
    def eval_pair_lambda(self, lamb=1.0, strategy=1):
        """
        Evaluate on COPA items 500..999 with word-pair similarity scores that
        interpolate the setting-1 and setting-2 vector maps.

        Each option is scored as ``lamb * S1 + (1 - lamb) * S2`` where ``S1``
        and ``S2`` are the sums of ``self.get_similarity`` over every
        (sentence word, option word) pair present in the setting-1 and
        setting-2 vector maps respectively. Option 1 is chosen only when its
        score is strictly higher; ties go to option 2.

        :param lamb: weight of the setting-1 score; setting 2 gets 1 - lamb
        :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
        :return: final acc. (correct / number of evaluated items)
        """
        def pair_sum(q_vec_map, o_vec_map, question, option):
            # Sum the similarity of every in-vocabulary (question, option) pair.
            total = 0.0
            for word1 in question:
                if word1 not in q_vec_map:
                    continue
                q_vec = q_vec_map[word1]
                for word2 in option:
                    if word2 in o_vec_map:
                        total += self.get_similarity(q_vec, o_vec_map[word2])
            return total

        def normalise(score, question, option):
            # Apply the length normalisation selected by `strategy`.
            if strategy == 2:
                denom = len(question) + len(option)
            elif strategy == 3:
                denom = len(question) * len(option)
            else:
                return score
            # A zero denominator means there were no word pairs at all, so
            # the score is already 0.0; leave it instead of dividing by zero.
            return score / denom if denom else score

        def ratio(hit, total):
            # Accuracy that stays defined when a question type never occurs.
            return float(hit) / total if total else 0.0

        LogInfo.begin_track(
            "Eval on Copa using word pairs using lambda %.2f and strategy %d...",
            lamb, strategy)
        indices = range(500, 1000)
        correct = 0
        cause = 0
        effect = 0
        cause_correct = 0
        effect_correct = 0
        for i in indices:
            sentence, option1, option2 = self.copa_data[i]
            ask4 = self.copa_ground[i][0]
            is_cause = ask4 == 'cause'
            if is_cause:
                cause += 1
            else:
                effect += 1

            # left: setting-1 vector maps
            q_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='o')
            score1a = pair_sum(q_vec_map, o_vec_map, sentence, option1)
            score2a = pair_sum(q_vec_map, o_vec_map, sentence, option2)

            # right: setting-2 vector maps
            q_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='o')
            score1b = pair_sum(q_vec_map, o_vec_map, sentence, option1)
            score2b = pair_sum(q_vec_map, o_vec_map, sentence, option2)

            score1 = normalise((score1a * lamb) + (score1b * (1 - lamb)),
                               sentence, option1)
            score2 = normalise((score2a * lamb) + (score2b * (1 - lamb)),
                               sentence, option2)

            truth = self.copa_ground[i][1]
            answer = 1 if score1 > score2 else 2
            if answer == truth:
                correct += 1
                if is_cause:
                    cause_correct += 1
                else:
                    effect_correct += 1

        total = len(indices)
        accuracy = float(correct) / total
        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     accuracy, correct, total)
        LogInfo.logs("[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                     ratio(cause_correct, cause), cause_correct, cause,
                     ratio(effect_correct, effect), effect_correct, effect)
        LogInfo.end_track()
        return accuracy
Beispiel #19
0
def eval_seq_softmax(raw_score, y_true, method='precision'):
    """
    Span-level evaluation for sequence labeling
    under specific conditions (3-class: PL / PK / PV with B/I tags).

    A proposed span starts at a ``*_B`` tag and extends over the directly
    following ``*_I`` tags of the same kind. It counts as correct when the
    reference tags are identical over the span and the reference does not
    continue the span past its end. 'recall' is obtained by swapping the
    roles of prediction and reference.

    :param raw_score: class scores, [B, T, class_dim] (an already flattened
        [B * T, class_dim] array works too); the arg-max over the last axis
        is taken as the predicted tag id
    :param y_true: [B, T, ] reference tag ids
    :param method: precision/ recall
    :return: overall precision or recall over all span kinds
        (0.0 when no span was proposed)
    :raises ValueError: if method is neither 'precision' nor 'recall'
    """
    if method not in ('precision', 'recall'):
        raise ValueError(
            "method must be 'precision' or 'recall', got %r" % (method,))

    tag_dict = ['O', 'PL_B', 'PL_I', 'PK_B', 'PK_I', 'PV_B', 'PV_I']
    LogInfo.begin_track("Eval seq %s...", method)

    # The class dimension is the last one for both the documented 3-D input
    # and a flattened 2-D input, so reduce over axis=-1.
    proposed = np.argmax(raw_score, axis=-1).reshape((-1))
    reference = np.array(y_true).reshape((-1))
    if method == 'recall':
        proposed, reference = reference, proposed

    # NOTE(review): all sequences are flattened into one stream, so a span
    # could run across a sequence boundary - confirm this is acceptable.
    cnt = len(proposed)
    correct = {'PL': 0, 'PK': 0, 'PV': 0}
    act_cnt = {'PL': 0, 'PK': 0, 'PV': 0}
    i = 0
    while i < cnt:
        tag = tag_dict[proposed[i]]
        # Only a B tag opens a span; 'O' and stray I tags are skipped.
        if tag == 'O' or tag[3] != 'B':
            i += 1
            continue
        kind = tag[:2]
        inside_tag = kind + '_I'
        j = i + 1
        while j < cnt and tag_dict[proposed[j]] == inside_tag:
            j += 1

        act_cnt[kind] += 1
        same_labels = np.array_equal(reference[i:j], proposed[i:j])
        # reference[i] + 1 is the I tag of the same kind: the reference span
        # must also end at j, otherwise the proposal is only a prefix.
        if same_labels and (j == cnt or reference[j] != reference[i] + 1):
            correct[kind] += 1
        i = j

    correct_total = 0
    cnt_total = 0
    for key in act_cnt:
        if act_cnt[key] == 0:
            key_score = 0.0
        else:
            key_score = correct[key] * 1.0 / act_cnt[key]
        LogInfo.logs("%s : %.4f(%d/%d)", key, key_score, correct[key],
                     act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]

    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total,
                 cnt_total)
    LogInfo.end_track()
    return overall
Beispiel #20
0
    def eval_pair(self, setting=1, strategy=1):
        """
        Evaluate on COPA items 500..999 with word-pair similarity scores.

        Each option is scored as the sum of ``self.get_similarity`` over every
        (sentence word, option word) pair present in the vector maps returned
        by ``self.get_vec_map`` for the given setting. Option 1 is chosen only
        when its score is strictly higher; ties go to option 2.

        :param setting: forwarded to ``self.get_vec_map``; the cause/effect
            breakdown is logged only when it equals 3
        :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
        :return: final acc. (correct / number of evaluated items)
        """
        def pair_sum(q_vec_map, o_vec_map, question, option):
            # Sum the similarity of every in-vocabulary (question, option) pair.
            total = 0.0
            for word1 in question:
                if word1 not in q_vec_map:
                    continue
                q_vec = q_vec_map[word1]
                for word2 in option:
                    if word2 in o_vec_map:
                        total += self.get_similarity(q_vec, o_vec_map[word2])
            return total

        def normalise(score, question, option):
            # Apply the length normalisation selected by `strategy`.
            if strategy == 2:
                denom = len(question) + len(option)
            elif strategy == 3:
                denom = len(question) * len(option)
            else:
                return score
            # A zero denominator means there were no word pairs at all, so
            # the score is already 0.0; leave it instead of dividing by zero.
            return score / denom if denom else score

        def ratio(hit, total):
            # Accuracy that stays defined when a question type never occurs.
            return float(hit) / total if total else 0.0

        LogInfo.begin_track(
            "Eval on Copa using word pairs using setting %d and strategy %d...",
            setting, strategy)
        indices = range(500, 1000)
        correct = 0
        cause = 0
        effect = 0
        cause_correct = 0
        effect_correct = 0
        for i in indices:
            sentence, option1, option2 = self.copa_data[i]
            ask4 = self.copa_ground[i][0]
            is_cause = ask4 == 'cause'
            if is_cause:
                cause += 1
            else:
                effect += 1

            q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q')
            o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o')
            score1 = normalise(
                pair_sum(q_vec_map, o_vec_map, sentence, option1),
                sentence, option1)
            score2 = normalise(
                pair_sum(q_vec_map, o_vec_map, sentence, option2),
                sentence, option2)

            truth = self.copa_ground[i][1]
            answer = 1 if score1 > score2 else 2
            if answer == truth:
                correct += 1
                if is_cause:
                    cause_correct += 1
                else:
                    effect_correct += 1

        total = len(indices)
        accuracy = float(correct) / total
        LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                     accuracy, correct, total)
        if setting == 3:
            LogInfo.logs(
                "[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                ratio(cause_correct, cause), cause_correct, cause,
                ratio(effect_correct, effect), effect_correct, effect)
        LogInfo.end_track()
        return accuracy
Beispiel #21
0
def eval_seq_crf(y_pred_, y_true_, method='precision'):
    """
    Span-level evaluation for sequence labeling, without "Outside"
    under specific conditions (3-class: PL / PK / PV with B/I tags).

    Sequences are evaluated one by one. A proposed span starts at a ``*_B``
    tag and extends over the directly following ``*_I`` tags of the same
    kind. It counts as correct when the reference tags are identical over the
    span and the reference does not continue the span past its end. 'recall'
    is obtained by swapping the roles of prediction and reference.

    :param y_pred_: [B, T, ] predicted tag ids
    :param y_true_: [B, T, ] reference tag ids
    :param method: precision/ recall
    :return: overall precision or recall over all span kinds
        (0.0 when no span was proposed)
    :raises ValueError: if method is neither 'precision' nor 'recall'
    """
    if method not in ('precision', 'recall'):
        raise ValueError(
            "method must be 'precision' or 'recall', got %r" % (method,))

    tag_dict = ['O', 'PL_B', 'PL_I', 'PK_B', 'PK_I', 'PV_B', 'PV_I']
    LogInfo.begin_track("Eval seq %s...", method)

    # Rows are only iterated, indexed and sliced, so the inputs are used
    # as-is; this also keeps batches of unequal-length sequences working.
    if method == 'precision':
        proposed_rows, reference_rows = y_pred_, y_true_
    else:
        proposed_rows, reference_rows = y_true_, y_pred_

    correct = {'PL': 0, 'PK': 0, 'PV': 0}
    act_cnt = {'PL': 0, 'PK': 0, 'PV': 0}
    for line_pred, line_true in zip(proposed_rows, reference_rows):
        cnt = len(line_pred)
        i = 0
        while i < cnt:
            tag = tag_dict[line_pred[i]]
            # Only a B tag opens a span; 'O' and stray I tags are skipped.
            if tag == 'O' or tag[3] != 'B':
                i += 1
                continue
            kind = tag[:2]
            inside_tag = kind + '_I'
            j = i + 1
            while j < cnt and tag_dict[line_pred[j]] == inside_tag:
                j += 1

            act_cnt[kind] += 1
            same_labels = list(line_true[i:j]) == list(line_pred[i:j])
            # line_true[i] + 1 is the I tag of the same kind: the reference
            # span must also end at j, otherwise the proposal is a prefix.
            if same_labels and (
                    j == cnt or line_true[j] != line_true[i] + 1):
                correct[kind] += 1
            i = j

    correct_total = 0
    cnt_total = 0
    for key in act_cnt:
        if act_cnt[key] == 0:
            key_score = 0.0
        else:
            key_score = correct[key] * 1.0 / act_cnt[key]
        LogInfo.logs("%s : %.4f(%d/%d)", key, key_score, correct[key],
                     act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]

    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total,
                 cnt_total)
    LogInfo.end_track()
    return overall
Beispiel #22
0
    def load_data(self):
        """
        Load the cause/effect word vectors, the COPA items and the COPA
        ground truth from their fixed file locations.

        Fills ``self.sync`` / ``self.syne_neg`` and ``self.sync_neg`` /
        ``self.syne`` (word -> list of floats), appends
        ``[sentence, option1, option2]`` word lists to ``self.copa_data`` and
        ``[ask4, answer]`` pairs to ``self.copa_ground``.

        :return: None
        """
        def load_vector_pair(path_c, path_e, dict_c, dict_e):
            # Read two vector files in lockstep; each line is "word v1 v2 ...".
            with open(path_c) as finc, open(path_e) as fine:
                cnt = 0
                for linec, linee in zip(finc, fine):
                    cnt += 1
                    LogInfo.show_line(cnt, 100000)
                    sptc = linec.strip().split()
                    spte = linee.strip().split()
                    if not sptc or not spte:
                        # A blank line carries no word to index by.
                        continue
                    try:
                        # Build real lists: a lazy map() would defer the
                        # float() errors past this handler on Python 3 and
                        # leave single-use iterators in the dictionaries.
                        vecc = [float(x) for x in sptc[1:]]
                        vece = [float(x) for x in spte[1:]]
                    except ValueError:
                        LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                        continue
                    dict_c[sptc[0]] = vecc
                    dict_e[spte[0]] = vece

        def words_of(raw_line):
            # Keep the part after the first ':' of every token.
            # NOTE(review): tokens look like "TAG:word" - confirm the format.
            return [token.split(':')[1] for token in raw_line.strip().split()]

        LogInfo.begin_track("Loading data...")
        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt",
            "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt",
            self.sync, self.syne_neg)
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt",
            "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt",
            self.sync_neg, self.syne)
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # Every COPA item occupies three consecutive lines:
        # sentence, option 1, option 2.
        with open("/home/yuchen/data/copa_phr.txt") as fin:
            for _ in range(1000):
                sentence = words_of(fin.readline())
                option1 = words_of(fin.readline())
                option2 = words_of(fin.readline())
                self.copa_data.append([sentence, option1, option2])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        # Tab-separated label file: column 1 is the question type,
        # column 2 the index of the correct option.
        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                spt = line.strip().split('\t')
                self.copa_ground.append([spt[1], int(spt[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
Beispiel #23
0
    try_dir = sys.argv[2]
    root_path = 'runnings/%s/%s' % (setting_dir, try_dir)
    config_path = '%s/param_config' % root_path
    config = ConfigDict(config_path)

    vocab_loader = VocabularyLoader()
    vocab_loader.load_vocab(config.get("vocab_fp"),
                            config.get("embedding_dim"), 'utf-8')
    config.add("vocab_size", vocab_loader.vocab_size)
    LogInfo.logs("Embedding shape: %s.", vocab_loader.vocab_embedding.shape)

    data_loader = DataLoader(config.get("max_seq_len"),
                             vocab_loader.vocab_index_dict)
    data_loader.load(config.get("test_data_fp"), 'utf-8')

    LogInfo.begin_track("Create models...")
    graph = tf.Graph()
    with graph.as_default():
        test_model = IntentionIdentifier(
            config=config,
            mode=tf.contrib.learn.ModeKeys.EVAL,
            embedding_vocab=vocab_loader.vocab_embedding)
        LogInfo.logs("Test model created.")
    LogInfo.end_track()

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    LogInfo.begin_track("Start testing...")
    with tf.Session(graph=graph, config=tf_config) as session:
        test_model.load(session, root_path + "/model/best_model")
Beispiel #24
0
            spt = line.strip().split()
            if len(spt) < 3:
                LogInfo.logs("[error] bad line: %s", line)
            # store the sub-pinlei to delete in following algorithm
            pinlei_pairs['[[' + spt[0] + ']] [[' + spt[1] +
                         ']]'] = '[[' + spt[1] + ']]'
            pinlei_pairs['[[' + spt[1] + ']] [[' + spt[0] +
                         ']]'] = '[[' + spt[1] + ']]'
    LogInfo.logs("%d pinlei pairs loaded.", len(pinlei_pairs))

    # data transformer
    data_feeder = DataLoader(config.get("max_seq_len"),
                             vocab_loader.vocab_index_dict)

    # create model
    LogInfo.begin_track("Create models...")
    graph = tf.Graph()
    with graph.as_default():
        test_model = IntentionIdentifier(
            config=config,
            mode=tf.contrib.learn.ModeKeys.TRAIN,
            embedding_vocab=vocab_loader.vocab_embedding)
        LogInfo.logs("Test model created.")
    LogInfo.end_track()

    # tensorflow configuration
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    # testing started
    LogInfo.begin_track("Start testing...")
Beispiel #25
0
def word_word1(copa, label, cdic, enegdic, cnegdic, edic, lamd, num, setting, norm, ratio, verbose=False):
    """
    Evaluate COPA items num..999 by scoring (cause, effect) candidates with
    cal_score and return the accuracy.

    For a cause question (ask == 0) each option is scored as the cause of the
    premise; for an effect question (ask == 1) the premise is scored as the
    cause of each option. An item is correct when the strictly higher score
    belongs to the labelled option; ties count as wrong.

    :param copa: indexable of (premise, option1, option2) items
    :param label: indexable of (ask, answer) pairs; ask is 0 (cause) or
        1 (effect), answer is 1 or 2
    :param cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio:
        forwarded unchanged to cal_score
    :param num: index of the first item to evaluate
    :param verbose: when True, trace every step through LogInfo
    :return: correct / (1000 - num), or 0.0 when no item is evaluated
    """
    def score_pair(track_fmt, cause, effect):
        # Score one (cause, effect) candidate, tracing it when verbose.
        if verbose:
            LogInfo.begin_track(track_fmt, cause, effect)
        score = cal_score(cause, effect, cdic, enegdic, cnegdic, edic, lamd, setting, norm, ratio, verbose)
        if verbose:
            LogInfo.logs("final score: %.4f", score)
            LogInfo.end_track()
        return score

    acc = 0
    wrong = 0
    for i in range(num, 1000):
        hyp, alt1, alt2 = copa[i]
        ask, labl = label[i]
        if verbose:
            LogInfo.begin_track("step into copa #%d", i+1)
            LogInfo.logs("q: %s", hyp)
            LogInfo.logs("o1: %s", alt1)
            LogInfo.logs("o2: %s", alt2)
            LogInfo.logs("answer: o%d", labl)

        if ask == 0 or ask == 1:
            if ask == 0:
                # ask for cause: each option is a candidate cause of the premise
                if verbose:
                    LogInfo.begin_track("[ask for cause] o1/o2 -> q")
                score1 = score_pair("o1->q: [%s]->[%s]", alt1, hyp)
                score2 = score_pair("o2->q: [%s]->[%s]", alt2, hyp)
                right_mark, wrong_mark = "[[correct]]", "[[wrong]]"
            else:
                # ask for effect: the premise is the cause of each option
                if verbose:
                    LogInfo.begin_track("[ask for effect] q -> o1/o2")
                score1 = score_pair("q->o1: [%s]->[%s]", hyp, alt1)
                score2 = score_pair("q->o2: [%s]->[%s]", hyp, alt2)
                right_mark, wrong_mark = ">>correct<<", ">>wrong<<"

            if (score1 > score2 and labl == 1) or (score1 < score2 and labl == 2):
                acc += 1
                if verbose:
                    LogInfo.logs(right_mark)
            else:
                # Misses are counted for both question types so that the
                # status line below adds up.
                wrong += 1
                if verbose:
                    LogInfo.logs(wrong_mark)
            if verbose:
                LogInfo.end_track()
        else:
            # Unknown question type: report it; the item counts as not correct.
            print(ask)
            if verbose:
                LogInfo.logs("[error] ask=%d", ask)

        if verbose:
            LogInfo.end_track("end for #%d", i+1)
            LogInfo.logs("===========")

    total = 1000 - num
    if verbose:
        LogInfo.logs("status: %dY-%dW/%d", acc, wrong, total)
    # Nothing evaluated (num >= 1000): report 0.0 instead of dividing by zero.
    return acc*1.0/total if total > 0 else 0.0
Beispiel #26
0
    context_idx_test, context_seq_test, pinlei_idx_test = zip(*test_data)
    context_idx_test = np.array(context_idx_test)
    context_seq_test = np.array(context_seq_test)
    pinlei_idx_test = np.array(pinlei_idx_test)
    test_data = [context_idx_test, context_seq_test, pinlei_idx_test]

    data_loader.data.clear()
    LogInfo.logs("train: valid: test = %d: %d: %d.", train_size, valid_size,
                 test_size)
    # LogInfo.logs("train data: %s", train_data)
    # LogInfo.logs("valid data: %s", valid_data)
    # LogInfo.logs("test data: %s", test_data)

    batch_generator = BatchGenerator(train_data, config.get("batch_size"))

    LogInfo.begin_track("Create models...")
    graph = tf.Graph()
    with graph.as_default():
        train_model = IntentionIdentifier(
            config=config,
            mode=tf.contrib.learn.ModeKeys.TRAIN,
            embedding_vocab=vocab_loader.vocab_embedding)
        LogInfo.logs("Train model created.")

        # all get_variable parameters will be reused in eval
        tf.get_variable_scope().reuse_variables()

        eval_model = IntentionIdentifier(
            config=config,
            mode=tf.contrib.learn.ModeKeys.EVAL,
            embedding_vocab=vocab_loader.vocab_embedding)
Beispiel #27
0
    def load_data(self):
        """
        Load the cause/effect word vectors, the POS-filtered COPA items and
        the COPA ground truth from their fixed file locations.

        Fills ``self.sync`` / ``self.syne_neg`` and ``self.sync_neg`` /
        ``self.syne`` (word -> list of floats), appends
        ``[sentence, option1, option2]`` word lists to ``self.copa_data`` and
        ``[ask4, answer]`` pairs to ``self.copa_ground``.

        :return: None
        """
        def load_vector_pair(path_c, path_e, dict_c, dict_e):
            # Read two vector files in lockstep; each line is "word v1 v2 ...".
            with open(path_c) as finc, open(path_e) as fine:
                cnt = 0
                for linec, linee in zip(finc, fine):
                    cnt += 1
                    LogInfo.show_line(cnt, 100000)
                    sptc = linec.strip().split()
                    spte = linee.strip().split()
                    if not sptc or not spte:
                        # A blank line carries no word to index by.
                        continue
                    # Build real lists: a lazy map() would leave single-use
                    # iterators in the dictionaries on Python 3.
                    vecc = [float(x) for x in sptc[1:]]
                    vece = [float(x) for x in spte[1:]]
                    dict_c[sptc[0]] = vecc
                    dict_e[spte[0]] = vece

        def content_words(raw_line):
            # Keep tokens starting with NN, JJ or VB and return the part
            # after their first ':'.
            # NOTE(review): tokens look like "TAG:lemma" - confirm the format.
            return [token.split(':')[1]
                    for token in raw_line.strip().split()
                    if token.startswith(('NN', 'JJ', 'VB'))]

        LogInfo.begin_track("Loading data...")
        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt",
            "/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt",
            self.sync, self.syne_neg)
        LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                     len(self.sync), len(self.syne_neg))

        load_vector_pair(
            "/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt",
            "/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt",
            self.sync_neg, self.syne)
        LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                     len(self.sync_neg), len(self.syne))

        # Every COPA item occupies three consecutive lines:
        # sentence, option 1, option 2. Only NN, JJ, VB tokens are kept.
        with open("/home/yuchen/data/copa_lem.txt") as fin:
            for _ in range(1000):
                sentence = content_words(fin.readline())
                option1 = content_words(fin.readline())
                option2 = content_words(fin.readline())
                self.copa_data.append([sentence, option1, option2])
        LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))

        # Tab-separated label file: column 1 is the question type,
        # column 2 the index of the correct option.
        with open("/home/yuchen/data/copa_label.txt") as fin:
            for line in fin:
                spt = line.strip().split('\t')
                self.copa_ground.append([spt[1], int(spt[2])])
        LogInfo.logs("[log] copa ground truth loaded (%d).",
                     len(self.copa_ground))
        LogInfo.end_track()
Beispiel #28
0
    set_q = set()
    set_d = set()
    for i in range(0, len(query) - 2):
        set_q.add(query[i:i + 3])
    for i in range(0, len(document) - 2):
        set_d.add(document[i:i + 3])
    intersection = set_q.intersection(set_d)
    union = set_q.union(set_d)
    return float(len(intersection)) / len(union)


# Script entry point: build a lookup from Wikipedia entity names to the
# entity ids found in the first column of the triple file.
if __name__ == '__main__':
    wiki_path = "/home/xusheng/wikipedia/en-extracted"
    fb_path = "/home/kangqi/Freebase/Transform"

    LogInfo.begin_track("Loading wiki-fb entity map...")
    # normalised wiki entity name -> first column of the triple
    wiki_fb_map = dict()
    cnt = 0
    with open(fb_path + "/GS-cleanWiki-triple.txt") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            # Skip lines that do not have at least three tab-separated fields.
            if len(spt) < 3:
                continue
            fb_ent = spt[0]
            # Take what follows '/wiki/' in the third field and drop its last
            # character (presumably a closing delimiter - verify against data).
            wiki_ent = spt[2].split('/wiki/')[1][:-1]
            # Normalise: lower-case, underscores become spaces.
            wiki_ent = wiki_ent.lower().replace('_', ' ')
            # A later line with the same normalised name overwrites the earlier one.
            wiki_fb_map[wiki_ent] = fb_ent
            cnt += 1
            LogInfo.show_line(cnt, 500000)
    LogInfo.end_track("%d pairs in total", cnt)