def eval_avg(self, setting=1):
    """
    sentence representation = average of word vectors
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using average word representations using setting %d...",
        setting)
    correct = 0
    for i in range(500, 1000):
        ask4 = self.copa_ground[i][0]
        sentence, option1, option2 = self.copa_data[i]
        sent_vec = self.get_repr(sentence, ask4, setting, 'q')
        opt1_vec = self.get_repr(option1, ask4, setting, 'o')
        opt2_vec = self.get_repr(option2, ask4, setting, 'o')
        score1 = self.get_similarity(sent_vec, opt1_vec)
        score2 = self.get_similarity(sent_vec, opt2_vec)
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                 float(correct) / 500, correct, 500)
    LogInfo.end_track()
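# A minimal sketch of the similarity used above, assuming get_similarity is plain
# cosine similarity between two dense vectors; the actual implementation in the
# project may differ (e.g. it may already normalize the representations).
import numpy as np


def cosine_similarity(vec1, vec2):
    """Cosine similarity of two 1-D numpy arrays; returns 0.0 if either is all-zero."""
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return float(np.dot(vec1, vec2) / (norm1 * norm2))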
def load_pinlei(self):
    LogInfo.begin_track("Load pinlei names...")
    with codecs.open("/u01/xusheng/word2vec/vec/yyh_pinlei.txt", 'r',
                     encoding='utf-8') as fin:
        for line in fin:
            name = line.strip().split()[0]
            if name.startswith("[["):
                self.pinlei.add(name)
    LogInfo.end_track("Pinlei name loaded. Size: %d.", len(self.pinlei))
def add_pinlei_tag_yyh(self):
    LogInfo.begin_track("Begin adding tags for pinleis...")
    fin = codecs.open(self.root_fp + "/yyh_w2v_train.txt", 'r',
                      encoding='utf-8')
    fout = codecs.open(self.root_fp + "/yyh_w2v_train.txt.pinlei_tag", 'w',
                       encoding='utf-8')
    cnt = 0
    for line in fin:
        spt = line.strip().split()
        new_line = ""
        i = 0
        while i < len(spt):
            if i + 3 < len(spt):
                # note: the last term is spt[i + 3], not spt[3]
                str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
                if str4 in self.pinlei_set:
                    LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3])
                    new_line += "[[" + str4 + "]] "
                    i += 4
                    continue
            if i + 2 < len(spt):
                str3 = spt[i] + spt[i + 1] + spt[i + 2]
                if str3 in self.pinlei_set:
                    # LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                    #              spt[i], spt[i+1], spt[i+2])
                    new_line += "[[" + str3 + "]] "
                    i += 3
                    continue
            if i + 1 < len(spt):
                str2 = spt[i] + spt[i + 1]
                if str2 in self.pinlei_set:
                    # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                    #              spt[i], spt[i+1])
                    new_line += "[[" + str2 + "]] "
                    i += 2
                    continue
            if spt[i] in self.pinlei_set:
                # LogInfo.logs("Found pinlei [%s]", spt[i])
                new_line += "[[" + spt[i] + "]] "
                i += 1
                continue
            new_line += spt[i] + " "
            i += 1
        fout.write(new_line + "\n")
        cnt += 1
        if cnt < 5:
            LogInfo.logs("res ==> (%s)", new_line)
        LogInfo.show_line(cnt, 100000)
    fin.close()
    fout.close()
    LogInfo.end_track("Pinlei tags added.")
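# Toy illustration of the greedy longest-match tagging above, with made-up tokens
# (the real pinlei names are Chinese category strings loaded elsewhere).
# Given pinlei_set = {"ab", "cde"} and the input line "a b x c d e",
# the loop first tries 4-token, then 3-token, then 2-token concatenations,
# so the output line becomes "[[ab]] x [[cde]] ".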
def eval_pair(self, setting=1, strategy=1):
    """
    evaluation based on word pairs
    :param setting:
    :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on ROC using word pairs using setting %d and strategy %d...",
        setting, strategy)
    correct = 0
    for i in range(0, 1871):
        sentence, option1, option2 = self.copa_data[i]
        ask4 = self.copa_ground[i][0]
        q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q')
        o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o')
        score1 = 0.0
        score2 = 0.0
        for word1 in sentence:
            for word2 in option1:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score1 += self.get_similarity(q_vec_map[word1],
                                                  o_vec_map[word2])
        for word1 in sentence:
            for word2 in option2:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score2 += self.get_similarity(q_vec_map[word1],
                                                  o_vec_map[word2])
        if strategy == 2:
            score1 /= (len(sentence) + len(option1))
            score2 /= (len(sentence) + len(option2))
        elif strategy == 3:
            score1 /= (len(sentence) * len(option1))
            score2 /= (len(sentence) * len(option2))
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                 float(correct) / 1871, correct, 1871)
    LogInfo.end_track()
def load_pinlei(self):
    LogInfo.begin_track("Load pinlei names...")
    with codecs.open(self.root_fp + "/raw/kg_pinlei_id", 'r',
                     encoding='utf-8') as fin:
        for line in fin:
            spt = line.strip().split("\t")
            if len(spt) < 2:
                continue
            pinlei = spt[0]
            self.pinlei_set.add(pinlei)
    LogInfo.end_track("%d names loaded.", len(self.pinlei_set))
def load_vocab_embedding(self, embedding_file, encoding):
    LogInfo.begin_track("Loading embeddings from %s...", embedding_file)
    vocab_embedding = len(self.vocab_index_dict) * [None]
    with codecs.open(embedding_file, 'r', encoding=encoding) as fin:
        count = 0
        for line in fin:
            strs = line.split()
            embedding = [float(strs[i].strip()) for i in range(1, len(strs))]
            vocab_embedding[self.vocab_index_dict[strs[0].strip()]] = embedding
            count += 1
            LogInfo.show_line(count, 50000)
        assert count == len(vocab_embedding)
    self.vocab_embedding = np.asarray(vocab_embedding)
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
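# Assumed on-disk format for embedding_file, inferred from the parsing above:
# one token per line followed by whitespace-separated float components, and the
# token must already exist in vocab_index_dict. The tokens/values below are
# placeholders:
#
#   term_a 0.0123 -0.4521 0.3310 ... 0.0871
#   term_b 0.2210 0.0045 -0.1298 ... 0.4412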
def load(self, data_file, encoding):
    LogInfo.begin_track("Loading data from %s...", data_file)
    context_idxs, context_seqs, pinlei_idxs = list(), list(), list()
    cnt = 0
    with codecs.open(data_file, 'r', encoding=encoding) as fin:
        for line in fin:
            context_idx, context_seq, pinlei_idx = self.decode_line(line)
            context_idxs.append(context_idx)
            context_seqs.append(context_seq)
            pinlei_idxs.append(pinlei_idx)
            cnt += 1
            LogInfo.show_line(cnt, 10000)
    self.data = list(zip(context_idxs, context_seqs, pinlei_idxs))
    self.data_size = len(self.data)
    LogInfo.end_track()
def load_vocab_name(self, vocab_file, encoding):
    LogInfo.begin_track("Loading vocab from %s...", vocab_file)
    self.vocab_size = 0
    self.index_vocab_dict.clear()
    self.vocab_index_dict.clear()
    with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
        index = 0
        for line in fin:
            self.vocab_index_dict[line.strip()] = index
            self.index_vocab_dict.append(line.strip())
            index += 1
            LogInfo.show_line(index, 50000)
    self.vocab_size = index
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
def main():
    copa, worddic = readcopa()
    label = readlabel()
    cdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt")
    enegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt")
    cnegdic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt")
    edic = readvec(worddic, "/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt")
    verbose = False
    import sys
    mode = sys.argv[1]
    if mode == 'full':
        for ratio in range(21):
            for lamd in range(11):
                acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
                                 lamd*0.1, 500, 4, True, ratio*0.1, verbose)
                print ratio*0.1, lamd*0.1, acc
        # print "word pair with norm:"
        # for setting in range(3):
        #     for lamd in range(11):
        #         acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
        #                          lamd*0.1, 500, setting, True, verbose)
        #         print lamd*0.1, setting, acc
        # print "\nword pair without norm:"
        # for setting in range(3):
        #     for lamd in range(11):
        #         acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
        #                          lamd*0.1, 500, setting, False, verbose)
        #         print lamd*0.1, setting, acc
        #
        # print "\nsentence level with norm:"
        # for lamd in range(11):
        #     acc = sen_sen(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, True)
        #     print lamd*0.1, acc
        #
        # print "\nsentence level without norm:"
        # for lamd in range(11):
        #     acc = sen_sen(copa, label, cdic, enegdic, cnegdic, edic, lamd*0.1, 500, False)
        #     print lamd*0.1, acc
    elif mode == 'case':
        para1 = float(sys.argv[2])
        para2 = int(sys.argv[3])
        LogInfo.begin_track("case tracing for word-pair & lambda=%.1f, setting=%d:",
                            para1, para2)
        verbose = True
        acc = word_word1(copa, label, cdic, enegdic, cnegdic, edic,
                         para1, 500, para2, True, verbose)
        LogInfo.logs("[Accuracy] %.4f", acc)
        LogInfo.end_track()
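# Usage sketch for the argv handling above, assuming this script is saved as
# eval_copa.py (the file name is a placeholder):
#   python eval_copa.py full           # sweep ratio (0.0 .. 2.0) and lambda (0.0 .. 1.0)
#   python eval_copa.py case 0.5 4     # trace individual cases with lambda=0.5, setting=4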
def eval_avg_lambda(self, lamb=1.0):
    """
    sentence representation = average of word vectors
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using average word representations using lambda %.2f...",
        lamb)
    correct = 0
    for i in range(500, 1000):
        ask4 = self.copa_ground[i][0]
        sentence, option1, option2 = self.copa_data[i]
        sent_vec = self.get_repr(sentence, ask4, 1, 'q')
        opt1_vec = self.get_repr(option1, ask4, 1, 'o')
        opt2_vec = self.get_repr(option2, ask4, 1, 'o')
        score1a = self.get_similarity(sent_vec, opt1_vec)
        score2a = self.get_similarity(sent_vec, opt2_vec)
        sent_vec = self.get_repr(sentence, ask4, 2, 'q')
        opt1_vec = self.get_repr(option1, ask4, 2, 'o')
        opt2_vec = self.get_repr(option2, ask4, 2, 'o')
        score1b = self.get_similarity(sent_vec, opt1_vec)
        score2b = self.get_similarity(sent_vec, opt2_vec)
        score1 = (score1a * lamb) + (score1b * (1 - lamb))
        score2 = (score2a * lamb) + (score2b * (1 - lamb))
        # LogInfo.logs("[log] %.4f(%.2f^%.2f*%.2f^%.2f) ||| %.4f(%.2f^%.2f*%.2f^%.2f)",
        #              score1, score1a, lamb, score1b, 1-lamb,
        #              score2, score2a, lamb, score2b, 1-lamb)
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                 float(correct) / 500, correct, 500)
    LogInfo.end_track()
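# Usage sketch: sweep the interpolation weight between the setting-1 and
# setting-2 scores in steps of 0.1, assuming "evaluator" is an instance of the
# class that owns eval_avg_lambda (the name is a placeholder):
#   for step in range(11):
#       evaluator.eval_avg_lambda(lamb=step * 0.1)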
def prepare_model_data(self, pinlei_num):
    self.pinlei_num = pinlei_num
    LogInfo.begin_track("Generate Multi-Pinlei Data for evaluation...")
    fin = codecs.open(self.root_fp + "/query_label.txt." + str(self.pinlei_num),
                      'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/model_data_test." + str(self.pinlei_num) + ".name",
                       'w', encoding='utf-8')
    fsho = codecs.open(self.root_fp + "/model_data_test." + str(self.pinlei_num) + ".check",
                       'w', encoding='utf-8')
    cnt = 0
    not_cover = set()
    for line in fin:
        cnt += 1
        if cnt % 100000 == 0:
            LogInfo.logs("%d lines processed.", cnt)
            fout.flush()
        spt = line.strip().split("\t")
        context = spt[1]
        is_cover = True
        for i in range(2, 2 + self.pinlei_num):
            pinlei = "[[" + spt[i] + "]]"
            if pinlei not in self.pinlei:
                # LogInfo.logs("%s not cover.", pinlei)
                is_cover = False
                not_cover.add(pinlei)
        if not is_cover:
            continue
        if len(spt[1].split(" ")) < 6 or len(spt[1].split(" ")) > 15:
            continue
        for i in range(2, 2 + self.pinlei_num):
            pinlei = "[[" + spt[i] + "]]"
            fout.write(context + "\t" + pinlei + "\n")
        fsho.write(spt[0] + "\n")
    fin.close()
    fout.close()
    fsho.close()
    LogInfo.end_track("%d pinlei not cover.", len(not_cover))
def load_configs(fp):
    LogInfo.begin_track('Loading config from %s: ', fp)
    config_dict = {}
    with open(fp, 'r') as br:
        for line in br.readlines():
            line = line.strip()
            if line.startswith('#') or line == '':
                continue
            if line.find('\t') == -1:
                continue
            spt = line.split('\t')
            if len(spt) < 3:
                LogInfo.logs("[%s] is invalid, pls add type!", line)
                continue
            k = spt[0]
            v_str = spt[1]
            t = spt[2]
            if t == "d" or t == "int":
                config_dict[k] = int(v_str)
            elif t == "f" or t == "float" or t == "double":
                config_dict[k] = float(v_str)
            elif t == "b" or t == "bool":
                if v_str == "true" or v_str == "True" \
                        or v_str == "TRUE" or v_str == "1":
                    config_dict[k] = True
                else:
                    config_dict[k] = False
            elif t == "tf" or t == "tensorflow":
                if v_str == 'relu':
                    config_dict[k] = tf.nn.relu
                elif v_str == 'sigmoid':
                    config_dict[k] = tf.nn.sigmoid
                elif v_str == 'tanh':
                    config_dict[k] = tf.nn.tanh
            elif t == "None" or v_str == "None":
                config_dict[k] = None
            else:
                config_dict[k] = v_str
            LogInfo.logs('%s = %s', k, v_str)
    LogInfo.end_track()
    return config_dict
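# Assumed config-file layout consumed by load_configs: one tab-separated
# "key<TAB>value<TAB>type" triple per line; '#' lines are comments, and any
# unrecognized type falls back to a plain string. The keys below are
# illustrative only, not the project's actual configuration:
#
#   # model hyper-parameters
#   max_seq_len	20	int
#   learning_rate	0.001	float
#   use_crf	true	bool
#   activation	relu	tf
#   test_data_fp	./data/model_data_test	str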
def prepare_model_data(self):
    LogInfo.begin_track("Generate model data...")
    # .1 means single pinlei
    fin = codecs.open(self.root_fp + "/query_label.txt.1", 'r',
                      encoding='utf-8')
    fout = codecs.open(self.root_fp + "/model_data_train.name", 'w',
                       encoding='utf-8')
    not_cover = 0
    not_context = 0
    cnt = 0
    for line in fin:
        cnt += 1
        if cnt % 100000 == 0:
            LogInfo.logs("%d lines processed.", cnt)
            fout.flush()
        spt = line.strip().split("\t")
        context = spt[1]
        pinlei = "[[" + spt[2] + "]]"
        if pinlei not in self.pinlei:
            not_cover += 1
            continue
        if len(spt[1].split(" ")) < 6 or len(spt[1].split(" ")) > 15:
            not_context += 1
            continue
        fout.write(context + "\t" + pinlei + "\n")
        negs = self.neg_sample_random(pinlei, 19)
        for neg in negs:
            fout.write(context + "\t" + neg + "\n")
    fin.close()
    fout.close()
    LogInfo.end_track("Model data prepared. Size: %d. (%d, %d).",
                      cnt - not_context - not_cover, not_cover, not_context)
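# A minimal sketch of what neg_sample_random might look like (it is not shown
# in this excerpt): uniformly sample n pinlei tags other than the positive one
# from self.pinlei. This is an assumption; the actual sampler may weight
# candidates by frequency or cache the candidate list. Intended to sit inside
# the same class as prepare_model_data.
import random


def neg_sample_random(self, positive, n):
    candidates = [p for p in self.pinlei if p != positive]
    return random.sample(candidates, min(n, len(candidates)))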
def load_vocab(self, vocab_file, embedding_dim, encoding):
    LogInfo.begin_track("Loading vocab from %s...", vocab_file)
    self.vocab_size = 0
    self.index_vocab_dict.clear()
    self.vocab_index_dict.clear()
    self.vocab_embedding.clear()
    with codecs.open(vocab_file, 'r', encoding=encoding) as fin:
        index = 0
        # 0 embedding for not-found query term
        self.vocab_index_dict["[[NULL]]"] = index
        self.index_vocab_dict.append("[[NULL]]")
        self.vocab_embedding.append([0.0 for _ in range(embedding_dim)])
        index += 1
        for line in fin:
            spt = line.strip().split()
            self.vocab_index_dict[spt[0]] = index
            self.index_vocab_dict.append(spt[0])
            embedding = [float(spt[i].strip()) for i in range(1, len(spt))]
            self.vocab_embedding.append(embedding)
            index += 1
            LogInfo.show_line(index, 50000)
    self.vocab_size = len(self.vocab_embedding)
    self.vocab_embedding = np.array(self.vocab_embedding)
    LogInfo.end_track("Vocab loaded. Size: %d.", self.vocab_size)
def load(self, data_file, encoding):
    LogInfo.begin_track("Loading data from %s...", data_file)
    if os.path.isfile(data_file):
        LogInfo.begin_track("[Exist] Loading from %s...", data_file)
        query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
            = list(), list(), list(), list(), list(), list()
        cnt = 0
        with codecs.open(data_file, 'r', encoding=encoding) as fin:
            for line in fin:
                spt = line.strip().split("\t")
                query_idxs.append([int(idx) for idx in spt[0].split(" ")])
                query_lens.append(int(spt[1]))
                labels.append([int(idx) for idx in spt[2].split(" ")])
                intents.append(int(spt[3]))
                link_masks.append([int(idx) for idx in spt[4].split(" ")])
                entity_idxs.append([int(idx) for idx in spt[5].split(" ")])
                cnt += 1
                LogInfo.show_line(cnt, 1000000)
        LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
    else:
        txt_data_file = data_file + ".name"
        LogInfo.begin_track("[Not Exist] Loading from %s...", txt_data_file)
        query_idxs, query_lens, labels, intents, link_masks, entity_idxs \
            = list(), list(), list(), list(), list(), list()
        cnt = 0
        fout = codecs.open(data_file, 'w', encoding=encoding)
        with codecs.open(txt_data_file, 'r', encoding=encoding) as fin:
            for line in fin:
                query_idx, query_len, label, intent, link_mask, entity_idx \
                    = self.decode_line(line)
                fout.write(" ".join([str(x) for x in query_idx]) + "\t" +
                           str(query_len) + "\t" +
                           " ".join([str(x) for x in label]) + "\t" +
                           str(intent) + "\t" +
                           " ".join([str(x) for x in link_mask]) + "\t" +
                           " ".join([str(x) for x in entity_idx]) + "\n")
                query_idxs.append(query_idx)
                query_lens.append(query_len)
                labels.append(label)
                intents.append(intent)
                link_masks.append(link_mask)
                entity_idxs.append(entity_idx)
                cnt += 1
                LogInfo.show_line(cnt, 1000000)
        fout.close()
        LogInfo.logs("Write into %s.", data_file)
        LogInfo.end_track("Max_seq_len = %d.", self.max_seq_len)
    self.data = list(
        zip(query_idxs, query_lens, labels, intents, link_masks, entity_idxs))
    self.data_size = len(self.data)
    LogInfo.end_track("Loaded. Size: %d.", self.data_size)
def eval_seq_crf_with_o_atis(y_pred_, y_true_, method='precision'):
    """
    Evaluation for ATIS dataset, including "Outside" under specific conditions (3-class)
    :param y_pred_: [B, T, ]
    :param y_true_: [B, T, ]
    :param method: precision/ recall
    :return: precision or recall (combine the two calls externally for F1)
    """
    # LogInfo.logs("y_pred: %s", '\n'.join([str(x) for x in y_pred_]))
    # LogInfo.logs("y_true: %s", '\n'.join([str(x) for x in y_true_]))
    tag_dict = [
        'O', 'B-day_number', 'B-stoploc.state_code', 'B-toloc.state_code',
        'B-time_relative', 'B-fromloc.state_code', 'B-stoploc.airport_code',
        'B-airline_code', 'B-connect', 'B-depart_time.period_mod', 'B-flight',
        'B-arrive_time.period_mod', 'B-booking_class', 'B-month_name',
        'B-return_date.day_name', 'B-depart_date.month_name',
        'B-arrive_date.today_relative', 'B-return_time.period_of_day',
        'B-aircraft_code', 'B-arrive_date.date_relative', 'B-state_code',
        'B-days_code', 'B-airport_code', 'B-period_of_day',
        'B-arrive_date.day_name', 'B-flight_days', 'B-return_time.period_mod',
        'B-fromloc.airport_code', 'B-arrive_date.month_name', 'B-mod',
        'B-stoploc.airport_name', 'B-compartment', 'B-toloc.airport_code',
        'B-depart_date.date_relative', 'B-day_name', 'B-or',
        'B-depart_date.year', 'B-depart_date.day_name', 'B-toloc.country_name',
        'B-return_date.month_name', 'B-meal',
        'B-stoploc.city_name', 'I-stoploc.city_name',
        'B-round_trip', 'I-round_trip', 'B-state_name', 'I-state_name',
        'B-fromloc.city_name', 'I-fromloc.city_name',
        'B-airline_name', 'I-airline_name', 'B-flight_stop', 'I-flight_stop',
        'B-fromloc.airport_name', 'I-fromloc.airport_name',
        'B-arrive_time.start_time', 'I-arrive_time.start_time',
        'B-cost_relative', 'I-cost_relative', 'B-city_name', 'I-city_name',
        'B-arrive_time.end_time', 'I-arrive_time.end_time',
        'B-meal_code', 'I-meal_code',
        'B-depart_date.day_number', 'I-depart_date.day_number',
        'B-meal_description', 'I-meal_description',
        'B-arrive_time.time', 'I-arrive_time.time',
        'B-depart_date.today_relative', 'I-depart_date.today_relative',
        'B-fare_amount', 'I-fare_amount', 'B-airport_name', 'I-airport_name',
        'B-flight_time', 'I-flight_time', 'B-flight_number', 'I-flight_number',
        'B-toloc.airport_name', 'I-toloc.airport_name',
        'B-flight_mod', 'I-flight_mod',
        'B-depart_time.time_relative', 'I-depart_time.time_relative',
        'B-return_date.date_relative', 'I-return_date.date_relative',
        'B-economy', 'I-economy', 'B-class_type', 'I-class_type',
        'B-toloc.state_name', 'I-toloc.state_name',
        'B-arrive_time.period_of_day', 'I-arrive_time.period_of_day',
        'B-toloc.city_name', 'I-toloc.city_name',
        'B-depart_time.start_time', 'I-depart_time.start_time',
        'B-return_date.day_number', 'I-return_date.day_number',
        'B-today_relative', 'I-today_relative',
        'B-depart_time.end_time', 'I-depart_time.end_time',
        'B-fromloc.state_name', 'I-fromloc.state_name',
        'B-depart_time.time', 'I-depart_time.time',
        'B-return_date.today_relative', 'I-return_date.today_relative',
        'B-fare_basis_code', 'I-fare_basis_code',
        'B-arrive_date.day_number', 'I-arrive_date.day_number',
        'B-restriction_code', 'I-restriction_code',
        'B-transport_type', 'I-transport_type', 'B-time', 'I-time',
        'B-arrive_time.time_relative', 'I-arrive_time.time_relative',
        'B-depart_time.period_of_day', 'I-depart_time.period_of_day'
    ]
    LogInfo.begin_track("Eval seq %s on %d tags...", method, len(tag_dict))
    if method == 'precision':
        y_pred = np.array(y_pred_)
        y_true = np.array(y_true_)
    elif method == 'recall':
        y_pred = np.array(y_true_)
        y_true = np.array(y_pred_)
    names = set()
    for tag in tag_dict:
        if tag == 'O':
            names.add('O')
        else:
            names.add(tag[2:])
    LogInfo.logs("%d different terms", len(names))
    correct = dict()
    act_cnt = dict()
    for name in names:
        correct[name] = 0
        act_cnt[name] = 0
    for line_pred, line_true in zip(y_pred, y_true):
        i = 0
        cnt = len(line_pred)
        while i < cnt:
            tag_num = line_pred[i]
            tag = tag_dict[tag_num]
            if tag_num <= 40:
                # tags with "B" without "I", including "O"
                if tag_num == 0:
                    kind = 'O'
                else:
                    kind = tag[2:]
                act_cnt[kind] += 1
                if line_true[i] == line_pred[i]:
                    correct[kind] += 1
                i += 1
                continue
            else:
                kind = tag[2:]
                sign = tag[0]
                if sign == 'B':
                    j = i + 1
                    while j < cnt:
                        next_tag = tag_dict[line_pred[j]]
                        if next_tag[2:] == kind and next_tag[0] == 'I':
                            j += 1
                        else:
                            break
                else:
                    i += 1
                    continue
                act_cnt[kind] += 1
                act_label = ' '.join([str(x) for x in line_true[i:j]])
                proposed_label = ' '.join([str(x) for x in line_pred[i:j]])
                if act_label == proposed_label and (
                        j == cnt or line_true[j] != line_true[i] + 1):
                    correct[kind] += 1
                i = j
    ret = dict()
    keys = act_cnt.keys()
    correct_total = 0
    cnt_total = 0
    for key in keys:
        if act_cnt[key] == 0:
            ret[key] = 0.0
        else:
            ret[key] = correct[key] * 1.0 / act_cnt[key]
        LogInfo.logs("%s : %.4f(%d/%d)", key, ret[key], correct[key], act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]
    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total, cnt_total)
    LogInfo.end_track()
    return overall
def process_query(self):
    LogInfo.begin_track("Begin adding tags for queries...")
    fin = codecs.open(self.root_fp + "/query.txt", 'r', encoding='utf-8')
    fout = codecs.open(self.root_fp + "/query_label.txt", 'w', encoding='utf-8')
    cnt = 0
    for line in fin:
        spt = line.strip().split()
        new_line = ""
        context = ""
        label = set()
        i = 0
        while i < len(spt):
            if i + 4 < len(spt):
                str5 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3] + spt[i + 4]
                if str5 in self.pinlei_set:
                    LogInfo.logs("Found 5-term pinlei [%s|%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3], spt[i + 4])
                    label.add(str5)
                    new_line += "[[" + str5 + "]] "
                    i += 5
                    continue
            if i + 3 < len(spt):
                str4 = spt[i] + spt[i + 1] + spt[i + 2] + spt[i + 3]
                if str4 in self.pinlei_set:
                    LogInfo.logs("Found 4-term pinlei [%s|%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2], spt[i + 3])
                    label.add(str4)
                    new_line += "[[" + str4 + "]] "
                    i += 4
                    continue
            if i + 2 < len(spt):
                str3 = spt[i] + spt[i + 1] + spt[i + 2]
                if str3 in self.pinlei_set:
                    LogInfo.logs("Found 3-term pinlei [%s|%s|%s]",
                                 spt[i], spt[i + 1], spt[i + 2])
                    label.add(str3)
                    new_line += "[[" + str3 + "]] "
                    i += 3
                    continue
            if i + 1 < len(spt):
                str2 = spt[i] + spt[i + 1]
                if str2 in self.pinlei_set:
                    # LogInfo.logs("Found 2-term pinlei [%s|%s]",
                    #              spt[i], spt[i+1])
                    label.add(str2)
                    new_line += "[[" + str2 + "]] "
                    i += 2
                    continue
            if spt[i] in self.pinlei_set:
                # LogInfo.logs("Found pinlei [%s]", spt[i])
                label.add(spt[i])
                new_line += "[[" + spt[i] + "]] "
                i += 1
                continue
            context += spt[i] + " "
            new_line += spt[i] + " "
            i += 1
        if len(label) != 0:
            ret = new_line.strip() + "\t" + \
                  context.strip() + "\t" + \
                  "\t".join(label) + "\n"
        else:
            ret = new_line.strip() + "\n"
        fout.write(ret)
        cnt += 1
        if cnt < 5:
            LogInfo.logs("res ==> (%s)", ret.strip())
        LogInfo.show_line(cnt, 100000)
    fin.close()
    fout.close()
    LogInfo.end_track("Query processed.")
def eval_pair_lambda(self, lamb=1.0, strategy=1):
    """
    evaluation based on word pairs
    :param lamb:
    :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using word pairs using lambda %.2f and strategy %d...",
        lamb, strategy)
    correct = 0
    cause = 0
    effect = 0
    cause_correct = 0
    effect_correct = 0
    for i in range(500, 1000):
        sentence, option1, option2 = self.copa_data[i]
        ask4 = self.copa_ground[i][0]
        if ask4 == 'cause':
            cause += 1
        else:
            effect += 1
        # left
        q_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='q')
        o_vec_map = self.get_vec_map(ask4=ask4, setting=1, role='o')
        score1a = 0.0
        score2a = 0.0
        for word1 in sentence:
            for word2 in option1:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score1a += self.get_similarity(q_vec_map[word1],
                                                   o_vec_map[word2])
        for word1 in sentence:
            for word2 in option2:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score2a += self.get_similarity(q_vec_map[word1],
                                                   o_vec_map[word2])
        # right
        q_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='q')
        o_vec_map = self.get_vec_map(ask4=ask4, setting=2, role='o')
        score1b = 0.0
        score2b = 0.0
        for word1 in sentence:
            for word2 in option1:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score1b += self.get_similarity(q_vec_map[word1],
                                                   o_vec_map[word2])
        for word1 in sentence:
            for word2 in option2:
                if word1 in q_vec_map and word2 in o_vec_map:
                    score2b += self.get_similarity(q_vec_map[word1],
                                                   o_vec_map[word2])
        score1 = (score1a * lamb) + (score1b * (1 - lamb))
        score2 = (score2a * lamb) + (score2b * (1 - lamb))
        if strategy == 2:
            score1 /= (len(sentence) + len(option1))
            score2 /= (len(sentence) + len(option2))
        elif strategy == 3:
            score1 /= (len(sentence) * len(option1))
            score2 /= (len(sentence) * len(option2))
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
                if ask4 == 'cause':
                    cause_correct += 1
                else:
                    effect_correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f>%.2f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
                if ask4 == 'cause':
                    cause_correct += 1
                else:
                    effect_correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.2f<%.2f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                 float(correct) / 500, correct, 500)
    LogInfo.logs("[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
                 float(cause_correct) / cause, cause_correct, cause,
                 float(effect_correct) / effect, effect_correct, effect)
    LogInfo.end_track()
def eval_seq_softmax(raw_score, y_true, method='precision'):
    """
    Evaluation for sequence labeling under specific conditions (3-class)
    :param raw_score: [B*T, class_dim] softmax scores (batch and time flattened)
    :param y_true: [B, T, ]
    :param method: precision/ recall
    :return: precision or recall (combine the two calls externally for F1)
    """
    tag_dict = ['O', 'PL_B', 'PL_I', 'PK_B', 'PK_I', 'PV_B', 'PV_I']
    LogInfo.begin_track("Eval seq %s...", method)
    if method == 'precision':
        y_pred = np.argmax(raw_score, axis=1).reshape((-1))
        y_true = np.array(y_true).reshape((-1))
    elif method == 'recall':
        y_pred = np.array(y_true).reshape((-1))
        y_true = np.argmax(raw_score, axis=1).reshape((-1))
    # LogInfo.logs("y_pred: [%s]", ' '.join([str(x) for x in y_pred]))
    # LogInfo.logs("y_true: [%s]", ' '.join([str(x) for x in y_true]))
    # LogInfo.logs("y_pred: [%s]", y_pred)
    # LogInfo.logs("y_true: [%s]", y_true)
    cnt = len(y_pred)
    i = 0
    correct = {'PL': 0, 'PK': 0, 'PV': 0}
    act_cnt = {'PL': 0, 'PK': 0, 'PV': 0}
    while i < cnt:
        tag_num = y_pred[i]
        tag = tag_dict[tag_num]
        if tag == 'O':
            i += 1
            continue
        else:
            kind = tag[:2]
            sign = tag[3]
            if sign == 'B':
                j = i + 1
                while j < cnt:
                    next_tag = tag_dict[y_pred[j]]
                    if next_tag[:2] == kind and next_tag[3] == 'I':
                        j += 1
                    else:
                        break
            else:
                i += 1
                continue
            act_cnt[kind] += 1
            act_label = ' '.join([str(x) for x in y_true[i:j]])
            proposed_label = ' '.join([str(x) for x in y_pred[i:j]])
            if act_label == proposed_label and (j == cnt or y_true[j] != y_true[i] + 1):
                correct[kind] += 1
            i = j
    ret = dict()
    keys = act_cnt.keys()
    correct_total = 0
    cnt_total = 0
    for key in keys:
        if act_cnt[key] == 0:
            ret[key] = 0.0
        else:
            ret[key] = correct[key] * 1.0 / act_cnt[key]
        LogInfo.logs("%s : %.4f(%d/%d)", key, ret[key], correct[key], act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]
    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total, cnt_total)
    LogInfo.end_track()
    return overall
def eval_pair(self, setting=1, strategy=1):
    """
    evaluation based on word pairs
    :param setting:
    :param strategy: 1: sum, 2: /T1+T2, 3: /T1*T2
    :return: final acc.
    """
    LogInfo.begin_track(
        "Eval on Copa using word pairs using setting %d and strategy %d...",
        setting, strategy)
    correct = 0
    cause = 0
    effect = 0
    cause_correct = 0
    effect_correct = 0
    for i in range(500, 1000):
        sentence, option1, option2 = self.copa_data[i]
        ask4 = self.copa_ground[i][0]
        if ask4 == 'cause':
            cause += 1
        else:
            effect += 1
        q_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='q')
        o_vec_map = self.get_vec_map(ask4=ask4, setting=setting, role='o')
        score1 = 0.0
        score2 = 0.0
        show_list1 = list()
        show_list2 = list()
        for word1 in sentence:
            for word2 in option1:
                if word1 in q_vec_map and word2 in o_vec_map:
                    tmp = self.get_similarity(q_vec_map[word1], o_vec_map[word2])
                    score1 += tmp
                    show_list1.append("(%s, %s)-->%.2f" % (word1, word2, tmp))
        for word1 in sentence:
            for word2 in option2:
                if word1 in q_vec_map and word2 in o_vec_map:
                    tmp = self.get_similarity(q_vec_map[word1], o_vec_map[word2])
                    score2 += tmp
                    show_list2.append("(%s, %s)-->%.2f" % (word1, word2, tmp))
        # LogInfo.logs("[%d] Q: %s", i+1, ' '.join(sentence))
        # LogInfo.logs("[%d] O1: %s", i+1, ' '.join(option1))
        # LogInfo.logs("[%d] O2: %s", i+1, ' '.join(option2))
        # LogInfo.logs("[%d] ask4: [%s].", i+1, ask4)
        #
        # LogInfo.logs("[%d] %s.", i+1, " | ".join(show_list1))
        # LogInfo.logs("[%d] %s.", i+1, " | ".join(show_list2))
        if strategy == 2:
            score1 /= (len(sentence) + len(option1))
            score2 /= (len(sentence) + len(option2))
        elif strategy == 3:
            score1 /= (len(sentence) * len(option1))
            score2 /= (len(sentence) * len(option2))
        truth = self.copa_ground[i][1]
        if score1 > score2:
            if truth == 1:
                # LogInfo.logs("[%d] ret: %d(%.4f>%.4f), truth: %d. [T]", i+1, 1, score1, score2, truth)
                correct += 1
                if setting == 3:
                    if ask4 == 'cause':
                        cause_correct += 1
                    else:
                        effect_correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.4f>%.4f), truth: %d. [F]", i+1, 1, score1, score2, truth)
        else:
            if truth == 2:
                # LogInfo.logs("[%d] ret: %d(%.4f<%.4f), truth: %d. [T]", i+1, 2, score1, score2, truth)
                correct += 1
                if setting == 3:
                    if ask4 == 'cause':
                        cause_correct += 1
                    else:
                        effect_correct += 1
            # else:
            #     LogInfo.logs("[%d] ret: %d(%.4f<%.4f), truth: %d. [F]", i+1, 2, score1, score2, truth)
    LogInfo.logs("[summary] accuracy: %.4f(%d/%d).",
                 float(correct) / 500, correct, 500)
    if setting == 3:
        LogInfo.logs(
            "[summary] cause/effect acc.: %.4f(%d/%d)/%.4f(%d/%d)",
            float(cause_correct) / cause, cause_correct, cause,
            float(effect_correct) / effect, effect_correct, effect)
    LogInfo.end_track()
def eval_seq_crf(y_pred_, y_true_, method='precision'):
    """
    Evaluation for sequence labeling, without "Outside",
    under specific conditions (3-class)
    :param y_pred_: [B, T, ]
    :param y_true_: [B, T, ]
    :param method: precision/ recall
    :return: precision or recall (combine the two calls externally for F1)
    """
    # LogInfo.logs("y_pred: %s", '\n'.join([str(x) for x in y_pred_]))
    # LogInfo.logs("y_true: %s", '\n'.join([str(x) for x in y_true_]))
    tag_dict = ['O', 'PL_B', 'PL_I', 'PK_B', 'PK_I', 'PV_B', 'PV_I']
    LogInfo.begin_track("Eval seq %s...", method)
    if method == 'precision':
        y_pred = np.array(y_pred_)
        y_true = np.array(y_true_)
    elif method == 'recall':
        y_pred = np.array(y_true_)
        y_true = np.array(y_pred_)
    correct = {'PL': 0, 'PK': 0, 'PV': 0}
    act_cnt = {'PL': 0, 'PK': 0, 'PV': 0}
    for line_pred, line_true in zip(y_pred, y_true):
        i = 0
        cnt = len(line_pred)
        while i < cnt:
            tag_num = line_pred[i]
            tag = tag_dict[tag_num]
            if tag == 'O':
                i += 1
                continue
            else:
                kind = tag[:2]
                sign = tag[3]
                if sign == 'B':
                    j = i + 1
                    while j < cnt:
                        next_tag = tag_dict[line_pred[j]]
                        if next_tag[:2] == kind and next_tag[3] == 'I':
                            j += 1
                        else:
                            break
                else:
                    i += 1
                    continue
                act_cnt[kind] += 1
                act_label = ' '.join([str(x) for x in line_true[i:j]])
                proposed_label = ' '.join([str(x) for x in line_pred[i:j]])
                if act_label == proposed_label and (
                        j == cnt or line_true[j] != line_true[i] + 1):
                    correct[kind] += 1
                i = j
    ret = dict()
    keys = act_cnt.keys()
    correct_total = 0
    cnt_total = 0
    for key in keys:
        if act_cnt[key] == 0:
            ret[key] = 0.0
        else:
            ret[key] = correct[key] * 1.0 / act_cnt[key]
        LogInfo.logs("%s : %.4f(%d/%d)", key, ret[key], correct[key], act_cnt[key])
        correct_total += correct[key]
        cnt_total += act_cnt[key]
    if cnt_total == 0:
        overall = 0.0
    else:
        overall = correct_total * 1.0 / cnt_total
    LogInfo.logs("Over-all %s: %.4f(%d/%d)", method, overall, correct_total, cnt_total)
    LogInfo.end_track()
    return overall
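# The evaluators above return precision or recall per call, so F1 has to be
# combined externally; a minimal sketch (y_pred / y_true are the label-id
# sequences produced by the CRF decoder and the gold annotation):
def eval_seq_crf_f1(y_pred, y_true):
    p = eval_seq_crf(y_pred, y_true, method='precision')
    r = eval_seq_crf(y_pred, y_true, method='recall')
    if p + r == 0:
        return 0.0
    return 2 * p * r / (p + r)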
def load_data(self):
    """
    load data from files
    :return:
    """
    LogInfo.begin_track("Loading data...")
    # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
    #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
    with open("/home/yuchen/CppFiles/Causal/sync_wdFin_iter200.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/syneneg_wdFin_iter200.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            try:
                vecc = map(lambda x: float(x), sptc[1:])
                vece = map(lambda x: float(x), spte[1:])
                self.sync[wordc] = vecc
                self.syne_neg[worde] = vece
            except ValueError:
                LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                continue
    LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                 len(self.sync), len(self.syne_neg))
    with open("/home/yuchen/CppFiles/Causal/syncneg_wdFin_iter200.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/syne_wdFin_iter200.txt") as fine:
        # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
        #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            try:
                vecc = map(lambda x: float(x), sptc[1:])
                vece = map(lambda x: float(x), spte[1:])
                self.sync_neg[wordc] = vecc
                self.syne[worde] = vece
            except ValueError:
                LogInfo.logs("[error] %s | %s", sptc[0:3], spte[0:3])
                continue
    LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                 len(self.sync_neg), len(self.syne))
    # NN, JJ, VB
    with open("/home/yuchen/data/copa_phr.txt") as fin:
        for i in range(1000):
            raw_sentence = fin.readline()
            raw_option1 = fin.readline()
            raw_option2 = fin.readline()
            sentence = map(lambda x: x.split(':')[1], raw_sentence.strip().split())
            option1 = map(lambda x: x.split(':')[1], raw_option1.strip().split())
            option2 = map(lambda x: x.split(':')[1], raw_option2.strip().split())
            self.copa_data.append([sentence, option1, option2])
    LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))
    with open("/home/yuchen/data/copa_label.txt") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            self.copa_ground.append([spt[1], int(spt[2])])
    LogInfo.logs("[log] copa ground truth loaded (%d).", len(self.copa_ground))
    LogInfo.end_track()
config.add("vocab_size", vocab_loader.vocab_size) LogInfo.logs("Embedding shape: %s.", vocab_loader.vocab_embedding.shape) data_loader = DataLoader(config.get("max_seq_len"), vocab_loader.vocab_index_dict) data_loader.load(config.get("test_data_fp"), 'utf-8') LogInfo.begin_track("Create models...") graph = tf.Graph() with graph.as_default(): test_model = IntentionIdentifier( config=config, mode=tf.contrib.learn.ModeKeys.EVAL, embedding_vocab=vocab_loader.vocab_embedding) LogInfo.logs("Test model created.") LogInfo.end_track() tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True LogInfo.begin_track("Start testing...") with tf.Session(graph=graph, config=tf_config) as session: test_model.load(session, root_path + "/model/best_model") batch_size = config.get("test_batch_size") batch_num = int(data_loader.data_size / batch_size) score_list = list() for i in range(batch_num + 1): LogInfo.logs("Testing batch %d...", i + 1) if (i + 1) * batch_size > data_loader.data_size: test_data_batch = data_loader.data[i * batch_size:data_loader. data_size]
LogInfo.logs("%d pinlei pairs loaded.", len(pinlei_pairs)) # data transformer data_feeder = DataLoader(config.get("max_seq_len"), vocab_loader.vocab_index_dict) # create model LogInfo.begin_track("Create models...") graph = tf.Graph() with graph.as_default(): test_model = IntentionIdentifier( config=config, mode=tf.contrib.learn.ModeKeys.TRAIN, embedding_vocab=vocab_loader.vocab_embedding) LogInfo.logs("Test model created.") LogInfo.end_track() # tensorflow configuration tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # testing started LogInfo.begin_track("Start testing...") with tf.Session(graph=graph, config=tf_config) as session: test_model.load(session, root_path + "/model/best_model") cnt = 0 while True: LogInfo.logs("Please input your segmented (split by \' \') query:") query = input() LogInfo.begin_track("\nQuery: %s", query) if query == "exit" or query == "q":
def word_word1(copa, label, cdic, enegdic, cnegdic, edic,
               lamd, num, setting, norm, ratio, verbose=False):
    acc = 0
    wrong = 0
    for i in range(num, 1000):
        hyp, alt1, alt2 = copa[i]
        ask, labl = label[i]
        if verbose:
            LogInfo.begin_track("step into copa #%d", i+1)
            LogInfo.logs("q: %s", hyp)
            LogInfo.logs("o1: %s", alt1)
            LogInfo.logs("o2: %s", alt2)
            LogInfo.logs("answer: o%d", labl)
        # ask for cause
        if ask == 0:
            if verbose:
                LogInfo.begin_track("[ask for cause] o1/o2 -> q")
            cause, effect = alt1, hyp
            if verbose:
                LogInfo.begin_track("o1->q: [%s]->[%s]", cause, effect)
            score1 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score1)
                LogInfo.end_track()
            cause, effect = alt2, hyp
            if verbose:
                LogInfo.begin_track("o2->q: [%s]->[%s]", cause, effect)
            score2 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score2)
                LogInfo.end_track()
            if score1 > score2 and labl == 1:
                acc += 1
                if verbose:
                    LogInfo.logs("[[correct]]")
            if score1 < score2 and labl == 2:
                acc += 1
                if verbose:
                    LogInfo.logs("[[correct]]")
            if verbose:
                LogInfo.end_track()
        # ask for effect
        elif ask == 1:
            if verbose:
                LogInfo.begin_track("[ask for effect] q -> o1/o2")
            cause, effect = hyp, alt1
            if verbose:
                LogInfo.begin_track("q->o1: [%s]->[%s]", cause, effect)
            score1 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score1)
                LogInfo.end_track()
            cause, effect = hyp, alt2
            if verbose:
                LogInfo.begin_track("q->o2: [%s]->[%s]", cause, effect)
            score2 = cal_score(cause, effect, cdic, enegdic, cnegdic, edic,
                               lamd, setting, norm, ratio, verbose)
            if verbose:
                LogInfo.logs("final score: %.4f", score2)
                LogInfo.end_track()
            if score1 > score2 and labl == 1:
                acc += 1
                if verbose:
                    LogInfo.logs(">>correct<<")
            elif score1 < score2 and labl == 2:
                acc += 1
                if verbose:
                    LogInfo.logs(">>correct<<")
            else:
                wrong += 1
                if verbose:
                    LogInfo.logs(">>wrong<<")
            if verbose:
                LogInfo.end_track()
        else:
            print ask
            if verbose:
                LogInfo.logs("[error] ask=%d", ask)
        if verbose:
            LogInfo.end_track("end for #%d", i+1)
            LogInfo.logs("===========")
    if verbose:
        LogInfo.logs("status: %dY-%dW/%d", acc, wrong, 1000-num)
    return acc*1.0/(1000-num)
with graph.as_default():
    train_model = IntentionIdentifier(
        config=config,
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        embedding_vocab=vocab_loader.vocab_embedding)
    LogInfo.logs("Train model created.")
    # all get_variable parameters will be reused in eval
    tf.get_variable_scope().reuse_variables()
    eval_model = IntentionIdentifier(
        config=config,
        mode=tf.contrib.learn.ModeKeys.EVAL,
        embedding_vocab=vocab_loader.vocab_embedding)
    LogInfo.logs("Eval model created.")
LogInfo.end_track()

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True

best_valid_acc = 0.0
waiting = 0
LogInfo.begin_track("Start training...")
with tf.Session(graph=graph, config=tf_config) as session:
    session.run(tf.global_variables_initializer())
    for epoch in range(config.get("epoch")):
        LogInfo.begin_track("Epoch %d/%d...", epoch, config.get("epoch"))
        batch_generator.reset_batch_pointer()
        for batch in range(batch_generator.num_batches):
            batch_data = batch_generator.next_batch()
def load_data(self):
    """
    load data from files
    :return:
    """
    LogInfo.begin_track("Loading data...")
    # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
    #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
    with open("/home/yuchen/CppFiles/Causal/copy_sync_half_200_iter100.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/copy_syneneg_half_200_iter100.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            vecc = map(lambda x: float(x), sptc[1:])
            vece = map(lambda x: float(x), spte[1:])
            self.sync[wordc] = vecc
            self.syne_neg[worde] = vece
    LogInfo.logs("[log] sync/syneneg cause/effect vectors loaded (%d/%d).",
                 len(self.sync), len(self.syne_neg))
    with open("/home/yuchen/CppFiles/Causal/copy_syncneg_half_200_iter100.txt") as finc, \
            open("/home/yuchen/CppFiles/Causal/copy_syne_half_200_iter100.txt") as fine:
        # with open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as finc, \
        #         open("/home/yuchen/CppFiles/Causal/syn0_w2v_200.txt") as fine:
        cnt = 0
        for linec, linee in zip(finc, fine):
            cnt += 1
            LogInfo.show_line(cnt, 100000)
            sptc = linec.strip().split()
            spte = linee.strip().split()
            wordc = sptc[0]
            worde = spte[0]
            vecc = map(lambda x: float(x), sptc[1:])
            vece = map(lambda x: float(x), spte[1:])
            self.sync_neg[wordc] = vecc
            self.syne[worde] = vece
    LogInfo.logs("[log] syncneg/syne cause/effect vectors loaded (%d/%d).",
                 len(self.sync_neg), len(self.syne))
    # NN, JJ, VB
    with open("/home/yuchen/data/copa_lem.txt") as fin:
        for i in range(1000):
            raw_sentence = fin.readline()
            raw_option1 = fin.readline()
            raw_option2 = fin.readline()
            sentence = list()
            option1 = list()
            option2 = list()
            for word in raw_sentence.strip().split():
                if word.startswith('NN') or word.startswith('JJ') or word.startswith('VB'):
                    sentence.append(word.split(':')[1])
            for word in raw_option1.strip().split():
                if word.startswith('NN') or word.startswith('JJ') or word.startswith('VB'):
                    option1.append(word.split(':')[1])
            for word in raw_option2.strip().split():
                if word.startswith('NN') or word.startswith('JJ') or word.startswith('VB'):
                    option2.append(word.split(':')[1])
            self.copa_data.append([sentence, option1, option2])
    LogInfo.logs("[log] copa dataset loaded (%d).", len(self.copa_data))
    with open("/home/yuchen/data/copa_label.txt") as fin:
        for line in fin:
            spt = line.strip().split('\t')
            self.copa_ground.append([spt[1], int(spt[2])])
    LogInfo.logs("[log] copa ground truth loaded (%d).", len(self.copa_ground))
    LogInfo.end_track()
LogInfo.begin_track("Loading wiki-fb entity map...") wiki_fb_map = dict() cnt = 0 with open(fb_path + "/GS-cleanWiki-triple.txt") as fin: for line in fin: spt = line.strip().split('\t') if len(spt) < 3: continue fb_ent = spt[0] wiki_ent = spt[2].split('/wiki/')[1][:-1] wiki_ent = wiki_ent.lower().replace('_', ' ') wiki_fb_map[wiki_ent] = fb_ent cnt += 1 LogInfo.show_line(cnt, 500000) LogInfo.end_track("%d pairs in total", cnt) LogInfo.begin_track("Loading fb entity pop...") fb_ent_pop_map = dict() cnt = 0 with open("/home/xusheng/freebase/top5m.mid") as fin: for line in fin: spt = line.strip().split('\t') if len(spt) < 2: continue ent = spt[0] pop = int(spt[1]) fb_ent_pop_map[ent] = pop cnt += 1 LogInfo.show_line(cnt, 500000) LogInfo.end_track("%d entities in total", cnt)