def collect_env():
    """Rebuild ``RESULTS_FQ`` from the corpus file ``YLP_FILE``.

    ``RESULTS_FQ`` is reset to an empty dict, then the file is read line by
    line.  Each line is split on whitespace and every token found in
    ``RESULTS_AIM`` is recorded through ``append_to_fq`` together with up to
    three context pairs:

    * (previous token, next token), with ``PADDING`` standing in for a
      neighbour that falls outside the line;
    * (token two back, previous token), when the line has both;
    * (next token, token two ahead), when the line has both.

    A progress marker is printed every 5000 lines and a completion message
    at end of file.  A failed read is reported and the loop goes on to the
    next ``readline`` call.
    """
    global RESULTS_FQ
    RESULTS_FQ = {}
    line_num = 0
    with open(YLP_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except Exception:
                # Best-effort reading: report and try the next read.
                # ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit stop the scan.
                # NOTE(review): an error that repeats on every call would
                # keep this loop spinning - consider bounding the retries.
                print("READ ERROR:%d" % (line_num))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            line_num += 1
            if not (line_num % 5000):
                print('C:%d' % (line_num))
            tokens = line.split()
            n_tokens = len(tokens)
            for i, token in enumerate(tokens):
                if token not in RESULTS_AIM:
                    continue
                c_id = get_term_id(token)

                # Window 1: immediate neighbours, padded at the line edges.
                if i - 1 >= 0:
                    w_id_p = get_term_id(tokens[i - 1])
                else:
                    w_id_p = get_term_id(PADDING)
                if i + 1 < n_tokens:
                    w_id_n = get_term_id(tokens[i + 1])
                else:
                    w_id_n = get_term_id(PADDING)
                append_to_fq(c_id, w_id_p, w_id_n)

                # Window 2: the two tokens to the left (no padding).
                if i - 2 >= 0:
                    w_id_p = get_term_id(tokens[i - 2])
                    w_id_n = get_term_id(tokens[i - 1])
                    append_to_fq(c_id, w_id_p, w_id_n)

                # Window 3: the two tokens to the right (no padding).
                if i + 2 < n_tokens:
                    w_id_p = get_term_id(tokens[i + 1])
                    w_id_n = get_term_id(tokens[i + 2])
                    append_to_fq(c_id, w_id_p, w_id_n)
    return
def collect_env():
    """Rebuild ``RESULTS_FQ`` from the corpus file ``YLP_FILE``.

    ``RESULTS_FQ`` is reset to an empty dict, then the file is read line by
    line.  Each line is split on whitespace and every token found in
    ``RESULTS_AIM`` is recorded through ``append_to_fq`` together with up to
    three context pairs:

    * (previous token, next token), with ``PADDING`` standing in for a
      neighbour that falls outside the line;
    * (token two back, previous token), when the line has both;
    * (next token, token two ahead), when the line has both.

    A progress marker is printed every 5000 lines and a completion message
    at end of file.  A failed read is reported and the loop goes on to the
    next ``readline`` call.
    """
    global RESULTS_FQ
    RESULTS_FQ = {}
    line_num = 0
    with open(YLP_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except Exception:
                # Best-effort reading: report and try the next read.
                # ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit stop the scan.
                # NOTE(review): an error that repeats on every call would
                # keep this loop spinning - consider bounding the retries.
                print("READ ERROR:%d" % (line_num))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            line_num += 1
            if not (line_num % 5000):
                print('C:%d' % (line_num))
            tokens = line.split()
            n_tokens = len(tokens)
            for i, token in enumerate(tokens):
                if token not in RESULTS_AIM:
                    continue
                c_id = get_term_id(token)

                # Window 1: immediate neighbours, padded at the line edges.
                if i - 1 >= 0:
                    w_id_p = get_term_id(tokens[i - 1])
                else:
                    w_id_p = get_term_id(PADDING)
                if i + 1 < n_tokens:
                    w_id_n = get_term_id(tokens[i + 1])
                else:
                    w_id_n = get_term_id(PADDING)
                append_to_fq(c_id, w_id_p, w_id_n)

                # Window 2: the two tokens to the left (no padding).
                if i - 2 >= 0:
                    w_id_p = get_term_id(tokens[i - 2])
                    w_id_n = get_term_id(tokens[i - 1])
                    append_to_fq(c_id, w_id_p, w_id_n)

                # Window 3: the two tokens to the right (no padding).
                if i + 2 < n_tokens:
                    w_id_p = get_term_id(tokens[i + 1])
                    w_id_n = get_term_id(tokens[i + 2])
                    append_to_fq(c_id, w_id_p, w_id_n)
    return
def append_to_fq(c_id, w_id_p, w_id_n):
    """Count one occurrence of ``c_id`` under the context pair.

    The pair is packed into a single key - ``w_id_p`` shifted left by
    ``TAG_SHIFT`` and OR-ed with ``w_id_n`` - and
    ``RESULTS_FQ[key][c_id]`` is incremented, creating the inner dict and
    the counter on first use.  A pair whose two ids both equal the id of
    ``PADDING`` is ignored.
    """
    # The chained comparison short-circuits: the padding id is only looked
    # up once the two context ids are known to be equal.
    if w_id_n == w_id_p == get_term_id(PADDING):
        return
    pair_key = (w_id_p << TAG_SHIFT) | w_id_n
    per_pair = RESULTS_FQ.setdefault(pair_key, {})
    per_pair[c_id] = per_pair.get(c_id, 0) + 1
    return
def predict_one_shot(str_tst, aimword):
    """Pick a label for the first occurrence of ``aimword`` in ``str_tst``.

    The stripped sentence is segmented with ``pynlpir.segment``.  Up to
    three context pairs around the target are packed into keys
    (``left_id << TAG_SHIFT | right_id``):

    * (previous, next) segment, with the ``PADDING`` id for a missing side;
    * (two back, previous) when both exist;
    * (next, two ahead) when both exist.

    A pair is dropped when one side is the padding id and the segment on
    the other side is in ``RESULTS_STOP``.

    ``RESULTS_WS[target]`` maps each candidate label to a collection of
    related words (presumably senses and their cue words - confirm against
    the loader).  For each label, every context key present in
    ``RESULTS_FQ`` contributes ``count / total`` for each related word
    (other than the target itself) counted under that key.  The sum is
    multiplied by the number of hits and divided by the number of related
    words.

    Returns:
        ``None`` if ``aimword`` is not among the segments or the sentence
        has fewer than five segments; ``"_UNDEFINE_"`` if both immediate
        neighbours resolve to the padding id, if there are no candidate
        labels, or if the best score is zero; otherwise the label with the
        highest score (the first one in iteration order on ties).

    Raises:
        KeyError: if the target word has no entry in ``RESULTS_WS``.
    """
    padding_id = get_term_id(PADDING)
    segments = pynlpir.segment(str_tst.strip(), pos_tagging=False)
    n_seg = len(segments)
    if aimword not in segments or n_seg < 5:
        return None
    i = segments.index(aimword)
    target = segments[i]

    def _usable(pre_id, nex_id, pre_pos, nex_pos):
        # Drop a pair made of padding on one side and a stop word on the
        # other.
        if pre_id == padding_id and segments[nex_pos] in RESULTS_STOP:
            return False
        if nex_id == padding_id and segments[pre_pos] in RESULTS_STOP:
            return False
        return True

    # Prepare the surrounding words.
    context_keys = []
    env_pre = get_term_id(segments[i - 1]) if i - 1 >= 0 else padding_id
    env_nex = get_term_id(segments[i + 1]) if i + 1 < n_seg else padding_id
    if env_pre == padding_id and env_nex == padding_id:
        return "_UNDEFINE_"
    if _usable(env_pre, env_nex, i - 1, i + 1):
        context_keys.append(env_pre << TAG_SHIFT | env_nex)

    # The two-left and two-right windows are only used when fully inside
    # the sentence (no padding is substituted here).
    for pre_pos, nex_pos in ((i - 2, i - 1), (i + 1, i + 2)):
        if pre_pos < 0 or nex_pos >= n_seg:
            continue
        env_pre = get_term_id(segments[pre_pos])
        env_nex = get_term_id(segments[nex_pos])
        if _usable(env_pre, env_nex, pre_pos, nex_pos):
            context_keys.append(env_pre << TAG_SHIFT | env_nex)

    # Predict it!
    candidates = RESULTS_WS[target]
    if not candidates:
        # Nothing to rank; the original indexed an empty list here.
        return "_UNDEFINE_"

    # Resolve each known context key once, together with its total count.
    observed = []
    for key in context_keys:
        if key in RESULTS_FQ:
            counts = RESULTS_FQ[key]
            observed.append((counts, sum(counts.values())))

    scores = {}
    for label, related in candidates.items():
        total = 0
        hit = 0
        for counts, mass in observed:
            for w_s in related:
                if w_s == target:
                    continue
                w_id = get_term_id(w_s)
                if w_id in counts:
                    total += counts[w_id] / mass
                    hit += 1
        n_related = len(related)
        # An empty related-word collection scores 0 instead of dividing by
        # zero.
        scores[label] = (total * hit) / n_related if n_related else 0

    # max() returns the first maximal entry, the same winner a stable
    # descending sort would put first.
    best_label, best_score = max(scores.items(), key=lambda e: e[1])
    if best_score != 0:
        return best_label
    return "_UNDEFINE_"
def predict_sent(str_tst):
    """Print ``str_tst`` segment by segment, tagging words in ``RESULTS_WS``.

    The stripped sentence is segmented with ``pynlpir.segment``.  A segment
    that is not a key of ``RESULTS_WS`` is printed as-is followed by a
    space.  Every other segment is printed as ``word/label ``, where the
    label is chosen from up to three context pairs packed as
    ``left_id << TAG_SHIFT | right_id``:

    * (previous, next) segment, with the ``PADDING`` id for a missing side;
    * (two back, previous) when both exist;
    * (next, two ahead) when both exist.

    A pair is dropped when one side is the padding id and the segment on
    the other side is in ``RESULTS_STOP``.  For each candidate label in
    ``RESULTS_WS[word]`` the raw counts found in ``RESULTS_FQ`` for its
    related words (excluding the word itself) are summed, multiplied by the
    number of hits and divided by the number of related words.  Each hit
    also prints a trace line ``label-related-[left-right]-count``.

    ``_UNDEFINE_`` is printed as the label when both immediate neighbours
    resolve to the padding id, when there are no candidate labels, or when
    the best score is zero.  A final newline is printed after the last
    segment.
    """
    padding_id = get_term_id(PADDING)
    segments = pynlpir.segment(str_tst.strip(), pos_tagging=False)
    n_seg = len(segments)

    def _usable(pre_id, nex_id, pre_pos, nex_pos):
        # Drop a pair made of padding on one side and a stop word on the
        # other.
        if pre_id == padding_id and segments[nex_pos] in RESULTS_STOP:
            return False
        if nex_id == padding_id and segments[pre_pos] in RESULTS_STOP:
            return False
        return True

    for i, word in enumerate(segments):
        if word not in RESULTS_WS:
            print(word, end=' ')
            continue

        # Prepare the surrounding words.
        context_keys = []
        env_pre = get_term_id(segments[i - 1]) if i - 1 >= 0 else padding_id
        env_nex = (get_term_id(segments[i + 1])
                   if i + 1 < n_seg else padding_id)
        if env_pre == padding_id and env_nex == padding_id:
            print('%s/%s ' % (word, "_UNDEFINE_"), end='')
            continue
        if _usable(env_pre, env_nex, i - 1, i + 1):
            context_keys.append(env_pre << TAG_SHIFT | env_nex)

        # The two-left and two-right windows are only used when fully
        # inside the sentence (no padding is substituted here).
        for pre_pos, nex_pos in ((i - 2, i - 1), (i + 1, i + 2)):
            if pre_pos < 0 or nex_pos >= n_seg:
                continue
            env_pre = get_term_id(segments[pre_pos])
            env_nex = get_term_id(segments[nex_pos])
            if _usable(env_pre, env_nex, pre_pos, nex_pos):
                context_keys.append(env_pre << TAG_SHIFT | env_nex)

        scores = {}
        for label, related in RESULTS_WS[word].items():
            total = 0
            hit = 0
            for key in context_keys:
                if key not in RESULTS_FQ:
                    continue
                counts = RESULTS_FQ[key]
                for w_s in related:
                    if w_s == word:
                        continue
                    w_id = get_term_id(w_s)
                    if w_id in counts:
                        total += counts[w_id]
                        hit += 1
                        print(" %s-%s-[%s-%s]-%d " % (
                            label, w_s,
                            get_term_wd(key >> TAG_SHIFT),
                            get_term_wd(key & TAG_MASK),
                            counts[w_id]))
            n_related = len(related)
            # An empty related-word collection scores 0 instead of dividing
            # by zero.
            scores[label] = (total * hit) / n_related if n_related else 0

        # max() returns the first maximal entry, the same winner a stable
        # descending sort would put first.  No candidates means no label.
        if scores:
            best_label, best_score = max(scores.items(), key=lambda e: e[1])
        else:
            best_label, best_score = "_UNDEFINE_", 0
        if best_score != 0:
            print('%s/%s ' % (word, best_label), end='')
        else:
            print('%s/%s ' % (word, "_UNDEFINE_"), end='')
    print()
    return
def predict_one_shot(str_tst, aimword):
    """Pick a label for the first occurrence of ``aimword`` in ``str_tst``.

    The stripped sentence is segmented with ``pynlpir.segment``.  Up to
    three context pairs around the target are packed into keys
    (``left_id << TAG_SHIFT | right_id``):

    * (previous, next) segment, with the ``PADDING`` id for a missing side;
    * (two back, previous) when both exist;
    * (next, two ahead) when both exist.

    A pair is dropped when one side is the padding id and the segment on
    the other side is in ``RESULTS_STOP``.

    ``RESULTS_WS[target]`` maps each candidate label to a collection of
    related words (presumably senses and their cue words - confirm against
    the loader).  For each label, every context key present in
    ``RESULTS_FQ`` contributes ``count / total`` for each related word
    (other than the target itself) counted under that key.  The sum is
    multiplied by the number of hits and divided by the number of related
    words.

    Returns:
        ``None`` if ``aimword`` is not among the segments or the sentence
        has fewer than five segments; ``"_UNDEFINE_"`` if both immediate
        neighbours resolve to the padding id, if there are no candidate
        labels, or if the best score is zero; otherwise the label with the
        highest score (the first one in iteration order on ties).

    Raises:
        KeyError: if the target word has no entry in ``RESULTS_WS``.
    """
    padding_id = get_term_id(PADDING)
    segments = pynlpir.segment(str_tst.strip(), pos_tagging=False)
    n_seg = len(segments)
    if aimword not in segments or n_seg < 5:
        return None
    i = segments.index(aimword)
    target = segments[i]

    def _usable(pre_id, nex_id, pre_pos, nex_pos):
        # Drop a pair made of padding on one side and a stop word on the
        # other.
        if pre_id == padding_id and segments[nex_pos] in RESULTS_STOP:
            return False
        if nex_id == padding_id and segments[pre_pos] in RESULTS_STOP:
            return False
        return True

    # Prepare the surrounding words.
    context_keys = []
    env_pre = get_term_id(segments[i - 1]) if i - 1 >= 0 else padding_id
    env_nex = get_term_id(segments[i + 1]) if i + 1 < n_seg else padding_id
    if env_pre == padding_id and env_nex == padding_id:
        return "_UNDEFINE_"
    if _usable(env_pre, env_nex, i - 1, i + 1):
        context_keys.append(env_pre << TAG_SHIFT | env_nex)

    # The two-left and two-right windows are only used when fully inside
    # the sentence (no padding is substituted here).
    for pre_pos, nex_pos in ((i - 2, i - 1), (i + 1, i + 2)):
        if pre_pos < 0 or nex_pos >= n_seg:
            continue
        env_pre = get_term_id(segments[pre_pos])
        env_nex = get_term_id(segments[nex_pos])
        if _usable(env_pre, env_nex, pre_pos, nex_pos):
            context_keys.append(env_pre << TAG_SHIFT | env_nex)

    # Predict it!
    candidates = RESULTS_WS[target]
    if not candidates:
        # Nothing to rank; the original indexed an empty list here.
        return "_UNDEFINE_"

    # Resolve each known context key once, together with its total count.
    observed = []
    for key in context_keys:
        if key in RESULTS_FQ:
            counts = RESULTS_FQ[key]
            observed.append((counts, sum(counts.values())))

    scores = {}
    for label, related in candidates.items():
        total = 0
        hit = 0
        for counts, mass in observed:
            for w_s in related:
                if w_s == target:
                    continue
                w_id = get_term_id(w_s)
                if w_id in counts:
                    total += counts[w_id] / mass
                    hit += 1
        n_related = len(related)
        # An empty related-word collection scores 0 instead of dividing by
        # zero.
        scores[label] = (total * hit) / n_related if n_related else 0

    # max() returns the first maximal entry, the same winner a stable
    # descending sort would put first.
    best_label, best_score = max(scores.items(), key=lambda e: e[1])
    if best_score != 0:
        return best_label
    return "_UNDEFINE_"
def predict_sent(str_tst):
    """Print ``str_tst`` segment by segment, tagging words in ``RESULTS_WS``.

    The stripped sentence is segmented with ``pynlpir.segment``.  A segment
    that is not a key of ``RESULTS_WS`` is printed as-is followed by a
    space.  Every other segment is printed as ``word/label ``, where the
    label is chosen from up to three context pairs packed as
    ``left_id << TAG_SHIFT | right_id``:

    * (previous, next) segment, with the ``PADDING`` id for a missing side;
    * (two back, previous) when both exist;
    * (next, two ahead) when both exist.

    A pair is dropped when one side is the padding id and the segment on
    the other side is in ``RESULTS_STOP``.  For each candidate label in
    ``RESULTS_WS[word]`` the raw counts found in ``RESULTS_FQ`` for its
    related words (excluding the word itself) are summed, multiplied by the
    number of hits and divided by the number of related words.  Each hit
    also prints a trace line ``label-related-[left-right]-count``.

    ``_UNDEFINE_`` is printed as the label when both immediate neighbours
    resolve to the padding id, when there are no candidate labels, or when
    the best score is zero.  A final newline is printed after the last
    segment.
    """
    padding_id = get_term_id(PADDING)
    segments = pynlpir.segment(str_tst.strip(), pos_tagging=False)
    n_seg = len(segments)

    def _usable(pre_id, nex_id, pre_pos, nex_pos):
        # Drop a pair made of padding on one side and a stop word on the
        # other.
        if pre_id == padding_id and segments[nex_pos] in RESULTS_STOP:
            return False
        if nex_id == padding_id and segments[pre_pos] in RESULTS_STOP:
            return False
        return True

    for i, word in enumerate(segments):
        if word not in RESULTS_WS:
            print(word, end=' ')
            continue

        # Prepare the surrounding words.
        context_keys = []
        env_pre = get_term_id(segments[i - 1]) if i - 1 >= 0 else padding_id
        env_nex = (get_term_id(segments[i + 1])
                   if i + 1 < n_seg else padding_id)
        if env_pre == padding_id and env_nex == padding_id:
            print('%s/%s ' % (word, "_UNDEFINE_"), end='')
            continue
        if _usable(env_pre, env_nex, i - 1, i + 1):
            context_keys.append(env_pre << TAG_SHIFT | env_nex)

        # The two-left and two-right windows are only used when fully
        # inside the sentence (no padding is substituted here).
        for pre_pos, nex_pos in ((i - 2, i - 1), (i + 1, i + 2)):
            if pre_pos < 0 or nex_pos >= n_seg:
                continue
            env_pre = get_term_id(segments[pre_pos])
            env_nex = get_term_id(segments[nex_pos])
            if _usable(env_pre, env_nex, pre_pos, nex_pos):
                context_keys.append(env_pre << TAG_SHIFT | env_nex)

        scores = {}
        for label, related in RESULTS_WS[word].items():
            total = 0
            hit = 0
            for key in context_keys:
                if key not in RESULTS_FQ:
                    continue
                counts = RESULTS_FQ[key]
                for w_s in related:
                    if w_s == word:
                        continue
                    w_id = get_term_id(w_s)
                    if w_id in counts:
                        total += counts[w_id]
                        hit += 1
                        print(" %s-%s-[%s-%s]-%d " % (
                            label, w_s,
                            get_term_wd(key >> TAG_SHIFT),
                            get_term_wd(key & TAG_MASK),
                            counts[w_id]))
            n_related = len(related)
            # An empty related-word collection scores 0 instead of dividing
            # by zero.
            scores[label] = (total * hit) / n_related if n_related else 0

        # max() returns the first maximal entry, the same winner a stable
        # descending sort would put first.  No candidates means no label.
        if scores:
            best_label, best_score = max(scores.items(), key=lambda e: e[1])
        else:
            best_label, best_score = "_UNDEFINE_", 0
        if best_score != 0:
            print('%s/%s ' % (word, best_label), end='')
        else:
            print('%s/%s ' % (word, "_UNDEFINE_"), end='')
    print()
    return