def save(self, session, dir_path):
    import os
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    fp = dir_path + "/best_model"
    self.saver.save(session, fp)
    LogInfo.logs("Model saved into %s.", fp)
def load_kkv_table(file_path):
    kkv_table = dict()
    with codecs.open(file_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            spt = line.strip().split()
            if len(spt) < 3:
                LogInfo.logs("[error] bad line: %s", line.strip())
                continue
            kkv_table[spt[0] + ' ' + spt[1]] = spt[2]
    return kkv_table
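# --- usage sketch (assumption, not part of the original module) ----------------
# load_kkv_table expects whitespace-separated lines of the form "key1 key2 value"
# and keys the returned dict on "key1 key2". The file path and the looked-up pair
# below are hypothetical.
def _demo_load_kkv_table():
    table = load_kkv_table("data/kkv_sample.txt")  # hypothetical path
    LogInfo.logs("value for 'apple color': %s", table.get("apple color"))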
def create_batches(self):
    if self.data_size % self.batch_size == 0:
        self.num_batches = int(self.data_size / self.batch_size)
    else:
        self.num_batches = int(self.data_size / self.batch_size) + 1
    # When the data (tensor) is too small, give a better error message
    if self.num_batches == 0:
        assert False, "Not enough data. Make seq_length and batch_size small."
    LogInfo.logs("Batches created. (%d)", self.num_batches)
def load_pinlei(self): LogInfo.begin_track("Load pinlei names...") with codecs.open("/u01/xusheng/word2vec/vec/yyh_pinlei.txt", 'r', encoding='utf-8') as fin: for line in fin: name = line.strip().split()[0] if name.startswith("[["): self.pinlei.add(name) LogInfo.end_track("Pinlei name loaded. Size: %d.", len(self.pinlei))
def add_pinlei_tag_yyh(self):
    LogInfo.begin_track("Begin adding tags for pinleis...")
    fin = codecs.open(self.root_fp + "/yyh_w2v_train.txt", 'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/yyh_w2v_train.txt.pinlei_tag", 'w', encoding='utf-8')
    cnt = 0
    for line in fin:
        spt = line.strip().split()
        new_line = ""
        i = 0
        while i < len(spt):
            if i + 3 < len(spt):
                str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
                if str4 in self.pinlei_set:
                    LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3])
                    new_line += "[[" + str4 + "]] "
                    i += 4
                    continue
            if i + 2 < len(spt):
                str3 = spt[i] + spt[i + 1] + spt[i + 2]
                if str3 in self.pinlei_set:
                    # LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                    #              spt[i], spt[i+1], spt[i+2])
                    new_line += "[[" + str3 + "]] "
                    i += 3
                    continue
            if i + 1 < len(spt):
                str2 = spt[i] + spt[i + 1]
                if str2 in self.pinlei_set:
                    # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                    #              spt[i], spt[i+1])
                    new_line += "[[" + str2 + "]] "
                    i += 2
                    continue
            if spt[i] in self.pinlei_set:
                # LogInfo.logs("Found pinlei [%s]", spt[i])
                new_line += "[[" + spt[i] + "]] "
                i += 1
                continue
            new_line += spt[i] + " "
            i += 1
        fout.write(new_line + "\n")
        cnt += 1
        if cnt < 5:
            LogInfo.logs("res ==> (%s)", new_line)
        LogInfo.show_line(cnt, 100000)
    fin.close()
    fout.close()
    LogInfo.end_track("Pinlei tags added.")
def load_pinlei(self): LogInfo.begin_track("Load pinlei names...") with codecs.open(self.root_fp + "/raw/kg_pinlei_id", 'r', encoding='utf-8') as fin: for line in fin: spt = line.strip().split("\t") if len(spt) < 2: continue pinlei = spt[0] self.pinlei_set.add(pinlei) LogInfo.end_track("%d names loaded.", len(self.pinlei_set))
def tag_pinlei(self, query):
    LogInfo.logs("Tagging pinlei for your query...")
    spt = query.strip().split()
    new_line = ""
    context = ""
    label = set()
    i = 0
    while i < len(spt):
        if i + 4 < len(spt):
            str5 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3] + spt[i + 4]
            if "[[" + str5 + "]]" in self.pinlei:
                LogInfo.logs("Found 5-term pinlei [%s|%s|%s|%s|%s]",
                             spt[i], spt[i + 1], spt[i + 2], spt[i + 3], spt[i + 4])
                label.add("[[" + str5 + "]]")
                new_line += "[[" + str5 + "]] "
                i += 5
                continue
        if i + 3 < len(spt):
            str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
            if "[[" + str4 + "]]" in self.pinlei:
                LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                             spt[i], spt[i + 1], spt[i + 2], spt[i + 3])
                label.add("[[" + str4 + "]]")
                new_line += "[[" + str4 + "]] "
                i += 4
                continue
        if i + 2 < len(spt):
            str3 = spt[i] + spt[i + 1] + spt[i + 2]
            if "[[" + str3 + "]]" in self.pinlei:
                LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                             spt[i], spt[i + 1], spt[i + 2])
                label.add("[[" + str3 + "]]")
                new_line += "[[" + str3 + "]] "
                i += 3
                continue
        if i + 1 < len(spt):
            str2 = spt[i] + spt[i + 1]
            if "[[" + str2 + "]]" in self.pinlei:
                # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                #              spt[i], spt[i+1])
                label.add("[[" + str2 + "]]")
                new_line += "[[" + str2 + "]] "
                i += 2
                continue
        if "[[" + spt[i] + "]]" in self.pinlei:
            # LogInfo.logs("Found pinlei [%s]", spt[i])
            label.add("[[" + spt[i] + "]]")
            new_line += "[[" + spt[i] + "]] "
            i += 1
            continue
        context += spt[i] + " "
        new_line += spt[i] + " "
        i += 1
    return new_line.strip(), context.strip(), list(label)
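# --- usage sketch (assumption, not part of the original module) ----------------
# tag_pinlei greedily matches the longest known category name (up to 5 terms) at
# each position and returns the tagged query, the untagged context, and the list
# of matched labels. The tagger instance and query string below are hypothetical.
def _demo_tag_pinlei(tagger):
    tagged, context, labels = tagger.tag_pinlei("red long sleeve dress")
    LogInfo.logs("tagged: %s | context: %s | labels: %s",
                 tagged, context, "|".join(labels))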
def cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
              lamd, setting, norm, ratio, verbose=False):
    score = 0
    num = 0
    rcause = []
    reffect = []
    for word in cause:
        if word in cdic and word in cnegdic:
            rcause.append(word)
    for word in effect:
        if word in edic and word in enegdic:
            reffect.append(word)
    sort_map = dict()
    for wordc in rcause:
        for worde in reffect:
            if wordc == worde:
                continue
            score_suf = get_similar(cdic[wordc], enegdic[worde], norm)
            score_nec = get_similar(cnegdic[wordc], enegdic[worde], norm)
            tmp = lamd * score_suf + (1-lamd) * score_nec
            # check reverse
            score_reverse = get_similar(cdic[worde], enegdic[wordc], norm)
            if abs(score_suf-score_reverse) / min(abs(score_suf), abs(score_reverse)) < ratio:
                continue
            score += tmp
            num += 1
            tmp_str = "[%s]-[%s] ==> %.1f*[%.2f]+%.1f*[%.2f]=[%.4f]" % \
                (wordc, worde, lamd, score_suf, 1-lamd, score_nec, tmp)
            sort_map[tmp] = tmp_str
    if verbose:
        for line in [sort_map[k] for k in sorted(sort_map.keys(), reverse=True)]:
            LogInfo.logs(line)
    if setting == 1:
        return score
    elif setting == 2:
        if verbose:
            LogInfo.logs("%.4f / (%d+%d=%d) = %.4f",
                         score, len(rcause), len(reffect),
                         len(rcause)+len(reffect),
                         score / (len(rcause) + len(reffect)))
        return score/(len(rcause)+len(reffect))
    elif setting == 3:
        return score/(len(rcause)*len(reffect))
    elif num == 0:
        if verbose:
            LogInfo.logs("%.4f / %d = %.4f", score, 0, 0.0)
        return 0.0
    else:
        if verbose:
            LogInfo.logs("%.4f / %d = %.4f", score, num, score/num)
        return score/num
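# --- scoring sketch (assumption, not part of the original module) --------------
# cal_score mixes a "sufficient" and a "necessary" similarity for each word pair:
#     pair_score = lamd * score_suf + (1 - lamd) * score_nec
# e.g. with lamd = 0.7, score_suf = 0.8, score_nec = 0.2:
#     0.7 * 0.8 + 0.3 * 0.2 = 0.62
# The helper below only illustrates that mixing; its default values are made up.
def _demo_pair_score(lamd=0.7, score_suf=0.8, score_nec=0.2):
    return lamd * score_suf + (1 - lamd) * score_nec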
def eval_avg(self, setting=1):
    """
    sentence representation = average of word vectors
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using average word representations using setting %d...",
        setting)
    correct = 0
    for i in range(500, 1000):
        ask4 = self.copa_ground[i][0]
        sentence, option1, option2 = self.copa_data[i]
        sent_vec = self.get_repr(sentence, ask4, setting, 'q')
        opt1_vec = self.get_repr(option1, ask4, setting, 'o')
        opt2_vec = self.get_repr(option2, ask4, setting, 'o')
        score1 = self.get_similarity(sent_vec, opt1_vec)
        score2 = self.get_similarity(sent_vec, opt2_vec)
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).", float(correct) / 500, correct, 500)
    LogInfo.end_track()
def fuzzy_match_name(mention, vocab, PN):
    """
    :param mention: list of strings
    :param vocab: dict mapping name (string) to its character set
    :param PN: number of candidates = PN-1
    :return: list of strings with size PN-1
    """
    m_set = set()
    for ch in mention:
        m_set.add(ch)
    # LogInfo.begin_track("generate for %s [%s]...", mention, m_set)
    rank_list = TopKRankedList(PN - 1)
    for name, c_set in vocab.items():
        score = get_jaccard_score(m_set, c_set)
        # LogInfo.logs("%s [%s] : %.4f", name, c_set, score)
        if score == 1.0:
            continue
        rank_list.push((name, score))
    LogInfo.logs("Cands for %s: [%s]", mention, "|".join(rank_list.top_names()))
    # LogInfo.end_track()
    return rank_list.top_names()
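# --- helper sketch (assumption, not part of the original module) ---------------
# get_jaccard_score is referenced above but not shown here; a standard Jaccard
# similarity over the two character sets would look like the sketch below.
def get_jaccard_score_sketch(set_a, set_b):
    if not set_a and not set_b:
        return 0.0
    # |A and B| / |A or B|
    return float(len(set_a & set_b)) / len(set_a | set_b)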
def eval_pair(self, setting=1, strategy=1): """ evaluation based on word pairs :param setting: :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2 :return: final acc. """ LogInfo.begin_track( "Eval on ROC using word pairs using setting %d and strategy %d...", setting, strategy) correct = 0 for i in range(0, 1871): sentence, option1, option2 = self.copa_data[i] ask4 = self.copa_ground[i][0] q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q') o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o') score1 = 0.0 score2 = 0.0 for word1 in sentence: for word2 in option1: if word1 in q_vec_map and word2 in o_vec_map: score1 += self.get_similarity(q_vec_map[word1], o_vec_map[word2]) for word1 in sentence: for word2 in option2: if word1 in q_vec_map and word2 in o_vec_map: score2 += self.get_similarity(q_vec_map[word1], o_vec_map[word2]) if strategy == 2: score1 /= (len(sentence) + len(option1)) score2 /= (len(sentence) + len(option2)) elif strategy == 3: score1 /= (len(sentence) * len(option1)) score2 /= (len(sentence) * len(option2)) truth = self.copa_ground[i][1] if score1 > score2: if truth == 1: # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth) correct += 1 # else: # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth) else: if truth == 2: # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth) correct += 1 # else: # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth) LogInfo.logs("[summary] accuracy: %.4f(%d/%d).", float(correct) / 1871, correct, 1871) LogInfo.end_track()
def load_configs(fp):
    LogInfo.begin_track('Loading config from %s: ', fp)
    config_dict = {}
    with open(fp, 'r') as br:
        for line in br.readlines():
            line = line.strip()
            if line.startswith('#') or line == '':
                continue
            if line.find('\t') == -1:
                continue
            spt = line.split('\t')
            if len(spt) < 3:
                LogInfo.logs("[%s] is invalid, pls add type!", line)
                continue
            k = spt[0]
            v_str = spt[1]
            t = spt[2]
            if t == "d" or t == "int":
                config_dict[k] = int(v_str)
            elif t == "f" or t == "float" or t == "double":
                config_dict[k] = float(v_str)
            elif t == "b" or t == "bool":
                if v_str == "true" or v_str == "True" \
                        or v_str == "TRUE" or v_str == "1":
                    config_dict[k] = True
                else:
                    config_dict[k] = False
            elif t == "tf" or t == "tensorflow":
                if v_str == 'relu':
                    config_dict[k] = tf.nn.relu
                elif v_str == 'sigmoid':
                    config_dict[k] = tf.nn.sigmoid
                elif v_str == 'tanh':
                    config_dict[k] = tf.nn.tanh
            elif t == "None" or v_str == "None":
                config_dict[k] = None
            else:
                config_dict[k] = v_str
            LogInfo.logs('%s = %s', k, v_str)
    LogInfo.end_track()
    return config_dict
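# --- config file sketch (assumption, not part of the original repo) ------------
# load_configs expects tab-separated "key<TAB>value<TAB>type" lines, where type
# covers int (d), float (f), bool (b), tensorflow activations (tf), None, and
# plain strings (anything else). The file path, keys, and values below are
# hypothetical.
def _demo_load_configs(tmp_fp="/tmp/demo_config.txt"):
    lines = ["batch_size\t64\td",
             "lr\t0.001\tf",
             "use_crf\ttrue\tb",
             "activation\trelu\ttf",
             "model_name\tbilstm_crf\ts"]
    with open(tmp_fp, 'w') as fout:
        fout.write("\n".join(lines) + "\n")
    return load_configs(tmp_fp)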
def load_vocab_name(self, vocab_file, encoding):
    LogInfo.begin_track("Loading vocab from %s...", vocab_file)
    self.vocab_size = 0
    self.index_vocab_dict.clear()
    self.vocab_index_dict.clear()
    with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
        index = 0
        for line in fin:
            self.vocab_index_dict[line.strip()] = index
            self.index_vocab_dict.append(line.strip())
            index += 1
            LogInfo.show_line(index, 50000)
    self.vocab_size = index
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
def load(self, data_file, encoding): LogInfo.begin_track("Loading data from %s...", data_file) context_idxs, context_seqs, pinlei_idxs = list(), list(), list() cnt = 0 with codecs.open(data_file, 'r', encoding=encoding) as fin: for line in fin: context_idx, context_seq, pinlei_idx = self.decode_line(line) context_idxs.append(context_idx) context_seqs.append(context_seq) pinlei_idxs.append(pinlei_idx) cnt += 1 LogInfo.show_line(cnt, 10000) self.data = list(zip(context_idxs, context_seqs, pinlei_idxs)) self.data_size = len(self.data) LogInfo.end_track()
def load_vocab_embedding(self, embedding_file, encoding):
    LogInfo.begin_track("Loading embeddings from %s...", embedding_file)
    vocab_embedding = len(self.vocab_index_dict) * [None]
    with codecs.open(embedding_file, 'r', encoding=encoding) as fin:
        count = 0
        for line in fin:
            strs = line.split()
            embedding = [float(strs[i].strip()) for i in range(1, len(strs))]
            vocab_embedding[self.vocab_index_dict[strs[0].strip()]] = embedding
            count += 1
            LogInfo.show_line(count, 50000)
    assert count == len(vocab_embedding)
    self.vocab_embedding = np.asarray(vocab_embedding)
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
def main():
    copa, worddic = readcopa()
    label = readlabel()
    cdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt")
    enegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt")
    cnegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt")
    edic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt")
    verbose = False
    import sys
    mode = sys.argv[1]
    if mode == 'full':
        for ratio in range(21):
            for lamd in range(11):
                acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
                                 lamd*0.1, 500, 4, True, ratio*0.1, verbose)
                print ratio*0.1, lamd*0.1, acc
        # print "word pair with norm:"
        # for setting in range(3):
        #     for lamd in range(11):
        #         acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
        #                          lamd*0.1, 500, setting, True, verbose)
        #         print lamd*0.1, setting, acc
        # print "\nword pair without norm:"
        # for setting in range(3):
        #     for lamd in range(11):
        #         acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
        #                          lamd*0.1, 500, setting, False, verbose)
        #         print lamd*0.1, setting, acc
        #
        # print "\nsentence level with norm:"
        # for lamd in range(11):
        #     acc = sen_sen(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, True)
        #     print lamd*0.1, acc
        #
        # print "\nsentence level without norm:"
        # for lamd in range(11):
        #     acc = sen_sen(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, False)
        #     print lamd*0.1, acc
    elif mode == 'case':
        para1 = float(sys.argv[2])
        para2 = int(sys.argv[3])
        LogInfo.begin_track("case tracing for word-pair & lambda=%.1f, setting=%d:",
                            para1, para2)
        verbose = True
        acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
                         para1, 500, para2, True, verbose)
        LogInfo.logs("[Accuracy] %.4f", acc)
        LogInfo.end_track()
def eval_avg_lambda(self, lamb=1.0):
    """
    sentence representation = average of word vectors
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using average word representations using lambda %.2f...",
        lamb)
    correct = 0
    for i in range(500, 1000):
        ask4 = self.copa_ground[i][0]
        sentence, option1, option2 = self.copa_data[i]
        sent_vec = self.get_repr(sentence, ask4, 1, 'q')
        opt1_vec = self.get_repr(option1, ask4, 1, 'o')
        opt2_vec = self.get_repr(option2, ask4, 1, 'o')
        score1a = self.get_similarity(sent_vec, opt1_vec)
        score2a = self.get_similarity(sent_vec, opt2_vec)
        sent_vec = self.get_repr(sentence, ask4, 2, 'q')
        opt1_vec = self.get_repr(option1, ask4, 2, 'o')
        opt2_vec = self.get_repr(option2, ask4, 2, 'o')
        score1b = self.get_similarity(sent_vec, opt1_vec)
        score2b = self.get_similarity(sent_vec, opt2_vec)
        score1 = (score1a * lamb) + (score1b * (1 - lamb))
        score2 = (score2a * lamb) + (score2b * (1 - lamb))
        # LogInfo.logs("[log] %.4f(%.2f^%.2f*%.2f^%.2f) ||| %.4f(%.2f^%.2f*%.2f^%.2f)",
        #              score1, score1a, lamb, score1b, 1-lamb,
        #              score2, score2a, lamb, score2b, 1-lamb)
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).", float(correct) / 500, correct, 500)
    LogInfo.end_track()
def prepare_model_data(self, pinlei_num):
    self.pinlei_num = pinlei_num
    LogInfo.begin_track("Generate Multi-Pinlei Data for evaluation...")
    fin = codecs.open(self.root_fp + "/query_label.txt." + str(self.pinlei_num),
                      'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/model_data_test." + str(self.pinlei_num) + ".name",
                       'w', encoding='utf-8')
    fsho = codecs.open(self.root_fp + "/model_data_test." + str(self.pinlei_num) + ".check",
                       'w', encoding='utf-8')
    cnt = 0
    not_cover = set()
    for line in fin:
        cnt += 1
        if cnt % 100000 == 0:
            LogInfo.logs("%d lines processed.", cnt)
            fout.flush()
        spt = line.strip().split("\t")
        context = spt[1]
        is_cover = True
        for i in range(2, 2 + self.pinlei_num):
            pinlei = "[[" + spt[i] + "]]"
            if pinlei not in self.pinlei:
                # LogInfo.logs("%s not cover.", pinlei)
                is_cover = False
                not_cover.add(pinlei)
        if not is_cover:
            continue
        if len(spt[1].split(" ")) < 6 or len(spt[1].split(" ")) > 15:
            continue
        for i in range(2, 2 + self.pinlei_num):
            pinlei = "[[" + spt[i] + "]]"
            fout.write(context + "\t" + pinlei + "\n")
        fsho.write(spt[0] + "\n")
    fin.close()
    fout.close()
    fsho.close()
    LogInfo.end_track("%d pinlei not cover.", len(not_cover))
def prepare_model_data(self):
    LogInfo.begin_track("Generate model data...")
    # .1 means single pinlei
    fin = codecs.open(self.root_fp + "/query_label.txt.1", 'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/model_data_train.name", 'w', encoding='utf-8')
    not_cover = 0
    not_context = 0
    cnt = 0
    for line in fin:
        cnt += 1
        if cnt % 100000 == 0:
            LogInfo.logs("%d lines processed.", cnt)
            fout.flush()
        spt = line.strip().split("\t")
        context = spt[1]
        pinlei = "[[" + spt[2] + "]]"
        if pinlei not in self.pinlei:
            not_cover += 1
            continue
        if len(spt[1].split(" ")) < 6 or len(spt[1].split(" ")) > 15:
            not_context += 1
            continue
        fout.write(context + "\t" + pinlei + "\n")
        negs = self.neg_sample_random(pinlei, 19)
        for neg in negs:
            fout.write(context + "\t" + neg + "\n")
    fin.close()
    fout.close()
    LogInfo.end_track("Model data prepared. Size: %d. (%d, %d).",
                      cnt - not_context - not_cover, not_cover, not_context)
def load_vocab(self, vocab_file, embedding_dim, encoding):
    LogInfo.begin_track("Loading vocab from %s...", vocab_file)
    self.vocab_size = 0
    self.index_vocab_dict.clear()
    self.vocab_index_dict.clear()
    self.vocab_embedding.clear()
    with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
        index = 0
        # 0 embedding for not-found query term
        self.vocab_index_dict["[[NULL]]"] = index
        self.index_vocab_dict.append("[[NULL]]")
        self.vocab_embedding.append([0.0 for _ in range(embedding_dim)])
        index += 1
        for line in fin:
            spt = line.strip().split()
            self.vocab_index_dict[spt[0]] = index
            self.index_vocab_dict.append(spt[0])
            embedding = [float(spt[i].strip()) for i in range(1, len(spt))]
            self.vocab_embedding.append(embedding)
            index += 1
            LogInfo.show_line(index, 50000)
    self.vocab_size = len(self.vocab_embedding)
    self.vocab_embedding = np.array(self.vocab_embedding)
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
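# --- vocab file sketch (assumption, not part of the original repo) -------------
# load_vocab expects one "word v1 v2 ... vD" line per term and reserves index 0
# for the [[NULL]] padding term with an all-zero vector. The loader instance,
# file path, words, and 4-dim vectors below are hypothetical.
def _demo_load_vocab(loader, tmp_fp="/tmp/demo_vocab.txt"):
    with codecs.open(tmp_fp, 'w', encoding='utf-8') as fout:
        fout.write("dress 0.12 -0.03 0.55 0.08\n")
        fout.write("red 0.02 0.31 -0.11 0.47\n")
    loader.load_vocab(tmp_fp, 4, 'utf-8')  # vocab_size becomes 3, incl. [[NULL]]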
def load(self, session, fp):
    LogInfo.logs("Loading Model from %s", fp)
    self.saver.restore(session, fp)
    LogInfo.logs("Model loaded from %s", fp)
def eval_pair(self, setting=1, strategy=1): """ evaluation based on word pairs :param setting: :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2 :return: final acc. """ LogInfo.begin_track( "Eval on Copa using word pairs using setting %d and strategy %d...", setting, strategy) correct = 0 cause = 0 effect = 0 cause_correct = 0 effect_correct = 0 for i in range(500, 1000): sentence, option1, option2 = self.copa_data[i] ask4 = self.copa_ground[i][0] if ask4 == 'cause': cause += 1 else: effect += 1 q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q') o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o') score1 = 0.0 score2 = 0.0 show_list1 = list() show_list2 = list() for word1 in sentence: for word2 in option1: if word1 in q_vec_map and word2 in o_vec_map: tmp = self.get_similarity(q_vec_map[word1], o_vec_map[word2]) score1 += tmp show_list1.append("(%s, %s)-->%.2f" % (word1, word2, tmp)) for word1 in sentence: for word2 in option2: if word1 in q_vec_map and word2 in o_vec_map: tmp = self.get_similarity(q_vec_map[word1], o_vec_map[word2]) score2 += tmp show_list2.append("(%s, %s)-->%.2f" % (word1, word2, tmp)) # LogInfo.logs("[%d] Q: %s", i+1, ' '.join(sentence)) # LogInfo.logs("[%d] O1: %s", i+1, ' '.join(option1)) # LogInfo.logs("[%d] O2: %s", i+1, ' '.join(option2)) # LogInfo.logs("[%d] ask4: [%s].", i+1, ask4) # # LogInfo.logs("[%d] %s.", i+1, " | ".join(show_list1)) # LogInfo.logs("[%d] %s.", i+1, " | ".join(show_list2)) if strategy == 2: score1 /= (len(sentence) + len(option1)) score2 /= (len(sentence) + len(option2)) elif strategy == 3: score1 /= (len(sentence) * len(option1)) score2 /= (len(sentence) * len(option2)) truth = self.copa_ground[i][1] if score1 > score2: if truth == 1: # LogInfo.logs("[%d] ret: %d(%.4f>%.4f), truth: %d. [T]", i+1, 1, score1, score2, truth) correct += 1 if setting == 3: if ask4 == 'cause': cause_correct += 1 else: effect_correct += 1 # else: # LogInfo.logs("[%d] ret: %d(%.4f>%.4f), truth: %d. [F]", i+1, 1, score1, score2, truth) else: if truth == 2: # LogInfo.logs("[%d] ret: %d(%.4f<%.4f), truth: %d. [T]", i+1, 2, score1, score2, truth) correct += 1 if setting == 3: if ask4 == 'cause': cause_correct += 1 else: effect_correct += 1 # else: # LogInfo.logs("[%d] ret: %d(%.4f<%.4f), truth: %d. [F]", i+1, 2, score1, score2, truth) LogInfo.logs("[summary] accuracy: %.4f(%d/%d).", float(correct) / 500, correct, 500) if setting == 3: LogInfo.logs( "[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)", float(cause_correct) / cause, cause_correct, cause, float(effect_correct) / effect, effect_correct, effect) LogInfo.end_track()
def word_word1(copa, label, cdic, enegdic, cnegdic, edic,
               lamd, num, setting, norm, ratio, verbose=False):
    acc = 0
    wrong = 0
    for i in range(num, 1000):
        hyp, alt1, alt2 = copa[i]
        ask, labl = label[i]
        if verbose:
            LogInfo.begin_track("step into copa #%d", i+1)
            LogInfo.logs("q: %s", hyp)
            LogInfo.logs("o1: %s", alt1)
            LogInfo.logs("o2: %s", alt2)
            LogInfo.logs("answer: o%d", labl)
        # ask for cause
        if ask == 0:
            if verbose:
                LogInfo.begin_track("[ask for cause] o1/o2 -> q")
            cause, effect = alt1, hyp
            if verbose:
                LogInfo.begin_track("o1->q: [%s]->[%s]", cause, effect)
            score1 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score1)
                LogInfo.end_track()
            cause, effect = alt2, hyp
            if verbose:
                LogInfo.begin_track("o2->q: [%s]->[%s]", cause, effect)
            score2 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score2)
                LogInfo.end_track()
            if score1 > score2 and labl == 1:
                acc += 1
                if verbose:
                    LogInfo.logs("[[correct]]")
            if score1 < score2 and labl == 2:
                acc += 1
                if verbose:
                    LogInfo.logs("[[correct]]")
            if verbose:
                LogInfo.end_track()
        # ask for effect
        elif ask == 1:
            if verbose:
                LogInfo.begin_track("[ask for effect] q -> o1/o2")
            cause, effect = hyp, alt1
            if verbose:
                LogInfo.begin_track("q->o1: [%s]->[%s]", cause, effect)
            score1 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score1)
                LogInfo.end_track()
            cause, effect = hyp, alt2
            if verbose:
                LogInfo.begin_track("q->o2: [%s]->[%s]", cause, effect)
            score2 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score2)
                LogInfo.end_track()
            if score1 > score2 and labl == 1:
                acc += 1
                if verbose:
                    LogInfo.logs(">>correct<<")
            elif score1 < score2 and labl == 2:
                acc += 1
                if verbose:
                    LogInfo.logs(">>correct<<")
            else:
                wrong += 1
                if verbose:
                    LogInfo.logs(">>wrong<<")
            if verbose:
                LogInfo.end_track()
        else:
            print ask
            if verbose:
                LogInfo.logs("[error] ask=%d", ask)
        if verbose:
            LogInfo.end_track("end for #%d", i+1)
            LogInfo.logs("===========")
    if verbose:
        LogInfo.logs("status: %dY-%dW/%d", acc, wrong, 1000-num)
    return acc*1.0/(1000-num)
def eval_pair_lambda(self, lamb=1.0, strategy=1):
    """
    evaluation based on word pairs
    :param lamb:
    :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using word pairs using lambda %.2f and strategy %d...",
        lamb, strategy)
    correct = 0
    cause = 0
    effect = 0
    cause_correct = 0
    effect_correct = 0
    for i in range(500, 1000):
        sentence, option1, option2 = self.copa_data[i]
        ask4 = self.copa_ground[i][0]
        if ask4 == 'cause':
            cause += 1
        else:
            effect += 1
        # left
        q_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='q')
        o_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='o')
        score1a = 0.0
        score2a = 0.0
        for word1 in sentence:
            for word2 in option1:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score1a += self.get_similarity(q_vec_map[word1], o_vec_map[word2])
        for word1 in sentence:
            for word2 in option2:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score2a += self.get_similarity(q_vec_map[word1], o_vec_map[word2])
        # right
        q_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='q')
        o_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='o')
        score1b = 0.0
        score2b = 0.0
        for word1 in sentence:
            for word2 in option1:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score1b += self.get_similarity(q_vec_map[word1], o_vec_map[word2])
        for word1 in sentence:
            for word2 in option2:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score2b += self.get_similarity(q_vec_map[word1], o_vec_map[word2])
        score1 = (score1a * lamb) + (score1b * (1 - lamb))
        score2 = (score2a * lamb) + (score2b * (1 - lamb))
        if strategy == 2:
            score1 /= (len(sentence) + len(option1))
            score2 /= (len(sentence) + len(option2))
        elif strategy == 3:
            score1 /= (len(sentence) * len(option1))
            score2 /= (len(sentence) * len(option2))
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
                if ask4 == 'cause':
                    cause_correct += 1
                else:
                    effect_correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
                if ask4 == 'cause':
                    cause_correct += 1
                else:
                    effect_correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).", float(correct) / 500, correct, 500)
    LogInfo.logs("[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                 float(cause_correct) / cause, cause_correct, cause,
                 float(effect_correct) / effect, effect_correct, effect)
    LogInfo.end_track()
def get(self, key):
    if key not in self.config_dict:
        LogInfo.logs("[warning] key [%s] does not exist.", key)
    return self.config_dict.get(key, None)
def add(self, key, value):
    if key in self.config_dict:
        LogInfo.logs("[warning] key already exists [%s: %s], now change to [%s].",
                     key, str(self.config_dict.get(key)), value)
    self.config_dict[key] = value
def load_data(self): """ load data from files :return: """ LogInfo.begin_track("Loading data...") # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \ # open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine: with open("/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt") as finc, \ open("/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt") as fine: cnt = 0 for linec, linee in zip(finc, fine): cnt += 1 LogInfo.show_line(cnt, 100000) sptc = linec.strip().split() spte = linee.strip().split() wordc = sptc[0] worde = spte[0] vecc = map(lambda x: float(x), sptc[1:]) vece = map(lambda x: float(x), spte[1:]) self.sync[wordc] = vecc self.syne_neg[worde] = vece LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).", len(self.sync), len(self.syne_neg)) with open("/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt") as finc, \ open("/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt") as fine: # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \ # open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine: cnt = 0 for linec, linee in zip(finc, fine): cnt += 1 LogInfo.show_line(cnt, 100000) sptc = linec.strip().split() spte = linee.strip().split() wordc = sptc[0] worde = spte[0] vecc = map(lambda x: float(x), sptc[1:]) vece = map(lambda x: float(x), spte[1:]) self.sync_neg[wordc] = vecc self.syne[worde] = vece LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).", len(self.sync_neg), len(self.syne)) # NN, JJ, VB with open("/home/yuchen/data/copa_lem.txt") as fin: for i in range(1000): raw_sentence = fin.readline() raw_option1 = fin.readline() raw_option2 = fin.readline() sentence = list() option1 = list() option2 = list() for word in raw_sentence.strip().split(): if word.startswith('NN') or word.startswith( 'JJ') or word.startswith('VB'): sentence.append(word.split(':')[1]) for word in raw_option1.strip().split(): if word.startswith('NN') or word.startswith( 'JJ') or word.startswith('VB'): option1.append(word.split(':')[1]) for word in raw_option2.strip().split(): if word.startswith('NN') or word.startswith( 'JJ') or word.startswith('VB'): option2.append(word.split(':')[1]) self.copa_data.append([sentence, option1, option2]) LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data)) with open("/home/yuchen/data/copa_label.txt") as fin: for line in fin: spt = line.strip().split('\t') self.copa_ground.append([spt[1], int(spt[2])]) LogInfo.logs("[log] copa ground truth loaded (%d).", len(self.copa_ground)) LogInfo.end_track()
def load(self, data_file, encoding): LogInfo.begin_track("Loading data from %s...", data_file) if os.path.isfile(data_file): LogInfo.begin_track("[Exist] Loading from %s...", data_file) query_idxs, query_lens, labels, intents, link_masks, entity_idxs \ = list(), list(), list(), list(), list(), list() cnt = 0 with codecs.open(data_file, 'r', encoding=encoding) as fin: for line in fin: spt = line.strip().split("\t") query_idxs.append([int(idx) for idx in spt[0].split(" ")]) query_lens.append(int(spt[1])) labels.append([int(idx) for idx in spt[2].split(" ")]) intents.append(int(spt[3])) link_masks.append([int(idx) for idx in spt[4].split(" ")]) entity_idxs.append([int(idx) for idx in spt[5].split(" ")]) cnt += 1 LogInfo.show_line(cnt, 1000000) LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len) else: txt_data_file = data_file + ".name" LogInfo.begin_track("[Not Exist] Loading from %s...", txt_data_file) query_idxs, query_lens, labels, intents, link_masks, entity_idxs \ = list(), list(), list(), list(), list(), list() cnt = 0 fout = codecs.open(data_file, 'w', encoding=encoding) with codecs.open(txt_data_file, 'r', encoding=encoding) as fin: for line in fin: query_idx, query_len, label, intent, link_mask, entity_idx\ = self.decode_line(line) fout.write(" ".join([str(x) for x in query_idx]) + "\t" + str(query_len) + "\t" + " ".join([str(x) for x in label]) + "\t" + str(intent) + "\t" + " ".join([str(x) for x in link_mask]) + "\t" + " ".join([str(x) for x in entity_idx]) + "\n") query_idxs.append(query_idx) query_lens.append(query_len) labels.append(label) intents.append(intent) link_masks.append(link_mask) entity_idxs.append(entity_idx) cnt += 1 LogInfo.show_line(cnt, 1000000) fout.close() LogInfo.logs("Write into %s.", data_file) LogInfo.end_track("Max_seq_len = %d.", self.max) self.data = list( zip(query_idxs, query_lens, labels, intents, link_masks, entity_idxs)) self.data_size = len(self.data) LogInfo.end_track("Loaded. Size: %d.", self.data_size)
def _build_graph(self):
    self.query_idx = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.config.get("max_seq_len")])
    self.query_len = tf.placeholder(dtype=tf.int32, shape=[None, ])
    self.label = tf.placeholder(dtype=tf.int32,
                                shape=[None, self.config.get("max_seq_len")])
    self.batch_size = self.config.get("batch_size")

    with tf.device('/cpu:0'), tf.name_scope("embedding_layer"):
        term_embedding = tf.get_variable(
            name="embedding",
            shape=[self.config.get("vocab_size"), self.config.get("embedding_dim")],
            dtype=tf.float32,
            initializer=tf.constant_initializer(self.embedding_vocab)
        )
        self.query_embedding = tf.nn.embedding_lookup(term_embedding, self.query_idx)

    # tf.split: Tensor -> list of tensors
    # tf.stack: list of tensors -> one tensor
    self.query_slice = [
        tf.squeeze(_input, [1])
        for _input in tf.split(self.query_embedding,
                               self.config.get("max_seq_len"),
                               axis=1)
    ]
    # better style: use unstack! one tensor -> list of tensors
    # equal to the above one
    # self.query_slice = tf.unstack(self.query_embedding, axis=1)

    # bi-LSTM
    with tf.name_scope("rnn_encoder"):
        rnn_config = dict()
        key_list = ["cell_class", "num_units", "dropout_input_keep_prob",
                    "dropout_output_keep_prob", "num_layers"]
        for key in key_list:
            rnn_config[key] = self.config.get(key)
        rnn_encoder = BidirectionalRNNEncoder(rnn_config, self.mode)
        self.biLstm = rnn_encoder.encode(self.query_slice, self.query_len)

    # output dim = 2 * rnn cell dim (fw + bw)
    self.hidden_dim = self.config.get("num_units") * 2
    self.biLstm_clip = tf.clip_by_value(self.biLstm.attention_values,
                                        -self.config.get("grad_clip"),
                                        self.config.get("grad_clip"))

    # training parameters
    with tf.name_scope("parameters"):
        self.W_l = tf.get_variable(name="W_l",
                                   shape=[self.hidden_dim, self.config.get("label_num")],
                                   dtype=tf.float32,
                                   initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        self.b_l = tf.get_variable(name="b_l",
                                   shape=[self.config.get("label_num")],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))

    # above bi-LSTM
    self.outputs = tf.reshape(tensor=self.biLstm_clip, shape=[-1, self.hidden_dim])
    self.label_matrix = tf.nn.xw_plus_b(self.outputs, self.W_l, self.b_l)
    # [B, T, label_num]
    self.logits = tf.reshape(tensor=self.label_matrix,
                             shape=[-1, self.config.get("max_seq_len"),
                                    self.config.get("label_num")])
    # [label_num+1, label_num+1]
    self.transition_mat = tf.get_variable(
        "transitions",
        shape=[self.config.get("label_num")+1, self.config.get("label_num")+1],
        initializer=tf.contrib.layers.xavier_initializer(uniform=True))

    # ===================================== Loss ====================================== #
    # if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
    #     # softmax sequence loss for sequence nlu
    #     self.loss = softmax_sequence_loss(logits=self.logits,
    #                                       targets=self.label,
    #                                       sequence_length=self.query_len)
    #     self.loss = tf.reduce_mean(self.loss)

    # padding logits for crf loss, length += 1
    small = -1000.0
    start_logits = tf.concat(
        [small * tf.ones(shape=[self.batch_size, 1, self.config.get("label_num")]),
         tf.zeros(shape=[self.batch_size, 1, 1])],
        axis=-1
    )
    LogInfo.logs(start_logits.get_shape().as_list())
    pad_logits = tf.cast(small * tf.ones([self.batch_size,
                                          self.config.get("max_seq_len"), 1]),
                         tf.float32)
    LogInfo.logs(pad_logits.get_shape().as_list())
    self.logits = tf.concat([self.logits, pad_logits], axis=-1)
    self.logits = tf.concat([start_logits, self.logits], axis=1)
    LogInfo.logs(self.logits.get_shape().as_list())
    targets = tf.concat(
        [tf.cast(self.config.get("label_num")*tf.ones([self.batch_size, 1]), tf.int32),
         self.label],
        axis=-1
    )
    LogInfo.logs(targets.get_shape().as_list())

    # CRF layer
    self.log_likelihood, self.transition_mat = \
        tf.contrib.crf.crf_log_likelihood(
            inputs=self.logits,
            tag_indices=targets,
            transition_params=self.transition_mat,
            sequence_lengths=self.query_len+1)
    self.loss = tf.reduce_mean(-self.log_likelihood)

    # train op
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = get_optimizer(self.config.get("optimizer"), self.config.get("lr"))
    grads_and_vars = optimizer.compute_gradients(self.loss)
    self.train_op = optimizer.apply_gradients(grads_and_vars,
                                              global_step=self.global_step)
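# --- shape sketch (assumption, not part of the original module) ----------------
# The CRF padding above prepends a virtual "start" step and an extra start label,
# so logits go [B, T, L] -> [B, T+1, L+1] and targets go [B, T] -> [B, T+1],
# with label index L reserved for the start tag. The numbers below are made up.
def _demo_crf_padding_shapes(B=32, T=20, L=5):
    logits_shape = [B, T + 1, L + 1]   # after concat of start_logits and pad_logits
    targets_shape = [B, T + 1]         # label_num prepended as the start tag index
    return logits_shape, targets_shape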
def load_data(self): """ load data from files :return: """ LogInfo.begin_track("Loading data...") # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \ # open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine: with open("/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt") as finc, \ open("/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt") as fine: cnt = 0 for linec, linee in zip(finc, fine): cnt += 1 LogInfo.show_line(cnt, 100000) sptc = linec.strip().split() spte = linee.strip().split() wordc = sptc[0] worde = spte[0] try: vecc = map(lambda x: float(x), sptc[1:]) vece = map(lambda x: float(x), spte[1:]) self.sync[wordc] = vecc self.syne_neg[worde] = vece except ValueError: LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3]) continue LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).", len(self.sync), len(self.syne_neg)) with open("/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt") as finc, \ open("/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt") as fine: # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \ # open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine: cnt = 0 for linec, linee in zip(finc, fine): cnt += 1 LogInfo.show_line(cnt, 100000) sptc = linec.strip().split() spte = linee.strip().split() wordc = sptc[0] worde = spte[0] try: vecc = map(lambda x: float(x), sptc[1:]) vece = map(lambda x: float(x), spte[1:]) self.sync_neg[wordc] = vecc self.syne[worde] = vece except ValueError: LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3]) continue LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).", len(self.sync_neg), len(self.syne)) # NN, JJ, VB with open("/home/yuchen/data/copa_phr.txt") as fin: for i in range(1000): raw_sentence = fin.readline() raw_option1 = fin.readline() raw_option2 = fin.readline() sentence = map(lambda x: x.split(':')[1], raw_sentence.strip().split()) option1 = map(lambda x: x.split(':')[1], raw_option1.strip().split()) option2 = map(lambda x: x.split(':')[1], raw_option2.strip().split()) self.copa_data.append([sentence, option1, option2]) LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data)) with open("/home/yuchen/data/copa_label.txt") as fin: for line in fin: spt = line.strip().split('\t') self.copa_ground.append([spt[1], int(spt[2])]) LogInfo.logs("[log] copa ground truth loaded (%d).", len(self.copa_ground)) LogInfo.end_track()