Ejemplo n.º 1
0
def collect_env():
    """Rebuild RESULTS_FQ from the token contexts found in YLP_FILE.

    The file is read line by line and each line is split on whitespace.
    For every token that is a member of RESULTS_AIM, up to three
    (previous-id, next-id) pairs are passed to append_to_fq together with
    the token's own id:

    * the immediate left/right neighbours (PADDING stands in for a missing
      neighbour at either end of the line),
    * the two tokens directly before it, when both exist,
    * the two tokens directly after it, when both exist.

    A progress marker is printed every 5000 lines and a final message is
    printed once the end of the file is reached.
    """
    global RESULTS_AIM
    global RESULTS_FQ

    RESULTS_FQ = {}

    line_num = 0

    with open(YLP_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except UnicodeDecodeError:
                # Only undecodable input is skipped. A bare ``except`` here
                # would also swallow KeyboardInterrupt and would retry
                # forever on a persistent I/O failure, so other errors are
                # allowed to propagate.
                print("READ ERROR:%d" % (line_num))
                continue
            if not line:
                print("PROCESS DONE!")
                break

            line_num += 1
            if not (line_num % 5000):
                print('C:%d' % (line_num))

            tokens = line.split()
            last = len(tokens) - 1

            for i, token in enumerate(tokens):
                if token not in RESULTS_AIM:
                    continue

                c_id = get_term_id(token)

                # Immediate neighbours, padded at the line boundaries.
                w_id_p = get_term_id(tokens[i - 1] if i > 0 else PADDING)
                w_id_n = get_term_id(tokens[i + 1] if i < last else PADDING)
                append_to_fq(c_id, w_id_p, w_id_n)

                # The two tokens preceding the target.
                if i >= 2:
                    w_id_p = get_term_id(tokens[i - 2])
                    w_id_n = get_term_id(tokens[i - 1])
                    append_to_fq(c_id, w_id_p, w_id_n)

                # The two tokens following the target.
                if i + 2 <= last:
                    w_id_p = get_term_id(tokens[i + 1])
                    w_id_n = get_term_id(tokens[i + 2])
                    append_to_fq(c_id, w_id_p, w_id_n)

    return
Ejemplo n.º 2
0
def collect_env():
    """Rebuild RESULTS_FQ from the token contexts found in YLP_FILE.

    The file is read line by line and each line is split on whitespace.
    For every token that is a member of RESULTS_AIM, up to three
    (previous-id, next-id) pairs are passed to append_to_fq together with
    the token's own id:

    * the immediate left/right neighbours (PADDING stands in for a missing
      neighbour at either end of the line),
    * the two tokens directly before it, when both exist,
    * the two tokens directly after it, when both exist.

    A progress marker is printed every 5000 lines and a final message is
    printed once the end of the file is reached.
    """
    global RESULTS_AIM
    global RESULTS_FQ

    RESULTS_FQ = {}

    line_num = 0

    with open(YLP_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except UnicodeDecodeError:
                # Only undecodable input is skipped. A bare ``except`` here
                # would also swallow KeyboardInterrupt and would retry
                # forever on a persistent I/O failure, so other errors are
                # allowed to propagate.
                print("READ ERROR:%d" % (line_num))
                continue
            if not line:
                print("PROCESS DONE!")
                break

            line_num += 1
            if not (line_num % 5000):
                print('C:%d' % (line_num))

            tokens = line.split()
            last = len(tokens) - 1

            for i, token in enumerate(tokens):
                if token not in RESULTS_AIM:
                    continue

                c_id = get_term_id(token)

                # Immediate neighbours, padded at the line boundaries.
                w_id_p = get_term_id(tokens[i - 1] if i > 0 else PADDING)
                w_id_n = get_term_id(tokens[i + 1] if i < last else PADDING)
                append_to_fq(c_id, w_id_p, w_id_n)

                # The two tokens preceding the target.
                if i >= 2:
                    w_id_p = get_term_id(tokens[i - 2])
                    w_id_n = get_term_id(tokens[i - 1])
                    append_to_fq(c_id, w_id_p, w_id_n)

                # The two tokens following the target.
                if i + 2 <= last:
                    w_id_p = get_term_id(tokens[i + 1])
                    w_id_n = get_term_id(tokens[i + 2])
                    append_to_fq(c_id, w_id_p, w_id_n)

    return
Ejemplo n.º 3
0
def append_to_fq(c_id, w_id_p, w_id_n):
    """Count one occurrence of ``c_id`` under the (w_id_p, w_id_n) context.

    The two context ids are packed into a single integer key by shifting
    ``w_id_p`` left by TAG_SHIFT bits and OR-ing in ``w_id_n``.
    RESULTS_FQ maps that key to a dict of ``c_id -> count``.

    Nothing is recorded when both context ids are the PADDING id.
    """
    global RESULTS_FQ

    # The PADDING lookup only happens when the two ids already match.
    if w_id_p == w_id_n:
        if w_id_p == get_term_id(PADDING):
            return

    pair_key = (w_id_p << TAG_SHIFT) | w_id_n

    counts = RESULTS_FQ.setdefault(pair_key, {})
    counts[c_id] = counts.get(c_id, 0) + 1
    return
Ejemplo n.º 4
0
def append_to_fq(c_id, w_id_p, w_id_n):
    """Count one occurrence of ``c_id`` under the (w_id_p, w_id_n) context.

    The two context ids are packed into a single integer key by shifting
    ``w_id_p`` left by TAG_SHIFT bits and OR-ing in ``w_id_n``.
    RESULTS_FQ maps that key to a dict of ``c_id -> count``.

    Nothing is recorded when both context ids are the PADDING id.
    """
    global RESULTS_FQ

    # The PADDING lookup only happens when the two ids already match.
    if w_id_p == w_id_n:
        if w_id_p == get_term_id(PADDING):
            return

    pair_key = (w_id_p << TAG_SHIFT) | w_id_n

    counts = RESULTS_FQ.setdefault(pair_key, {})
    counts[c_id] = counts.get(c_id, 0) + 1
    return
Ejemplo n.º 5
0
def predict_one_shot(str_tst, aimword):
    """Choose the best-scoring RESULTS_WS label for ``aimword`` in a sentence.

    The stripped sentence is segmented with ``pynlpir.segment``. Around the
    first occurrence of ``aimword`` up to three context keys are built, each
    packing two ids as ``left << TAG_SHIFT | right``: the immediate
    neighbours, the two preceding segments and the two following segments.
    A pair is skipped when one side is the PADDING id and the segment on the
    other side is in RESULTS_STOP.

    Every label under ``RESULTS_WS[aimword]`` is then scored. For each
    context key present in RESULTS_FQ, each member word of the label (other
    than ``aimword`` itself) whose id appears under that key contributes its
    count divided by the total count stored under the key. The summed
    contributions are multiplied by the number of hits and divided by the
    number of member words.

    Args:
        str_tst: Raw sentence text.
        aimword: The segment to label.

    Returns:
        None when ``aimword`` is not among the segments or the sentence has
        fewer than five segments; ``"_UNDEFINE_"`` when both neighbours
        resolve to the PADDING id, when there is no label to score, or when
        the best score is zero; otherwise the label with the highest score
        (the first one encountered on ties).
    """
    global RESULTS_WS
    global RESULTS_FQ
    global RESULTS_STOP

    PADDING_ID = get_term_id(PADDING)

    str_tst = str_tst.strip()
    seg_list = pynlpir.segment(str_tst, pos_tagging=False)
    len_s = len(seg_list)

    if aimword not in seg_list or len_s < 5:
        return None

    i = seg_list.index(aimword)
    target = seg_list[i]

    # Prepare the surrounding words.
    score_list = []
    env_pre = get_term_id(seg_list[i - 1] if i >= 1 else PADDING)
    env_nex = get_term_id(seg_list[i + 1] if (i + 1) < len_s else PADDING)
    if env_pre == env_nex and env_pre == PADDING_ID:
        return "_UNDEFINE_"

    # Exactly one side can be padding from here on, so the index used on the
    # opposite side is always inside the segment list.
    if env_pre == PADDING_ID and seg_list[i + 1] in RESULTS_STOP:
        pass
    elif env_nex == PADDING_ID and seg_list[i - 1] in RESULTS_STOP:
        pass
    else:
        score_list.append(env_pre << TAG_SHIFT | env_nex)

    if (i - 2) >= 0:
        env_pre = get_term_id(seg_list[i - 2])
        env_nex = get_term_id(seg_list[i - 1])
        if env_pre == PADDING_ID and seg_list[i - 1] in RESULTS_STOP:
            pass
        elif env_nex == PADDING_ID and seg_list[i - 2] in RESULTS_STOP:
            pass
        else:
            score_list.append(env_pre << TAG_SHIFT | env_nex)

    if (i + 2) < len_s:
        env_pre = get_term_id(seg_list[i + 1])
        env_nex = get_term_id(seg_list[i + 2])
        if env_pre == PADDING_ID and seg_list[i + 2] in RESULTS_STOP:
            pass
        elif env_nex == PADDING_ID and seg_list[i + 1] in RESULTS_STOP:
            pass
        else:
            score_list.append(env_pre << TAG_SHIFT | env_nex)

    # Predict it!
    scores = {}
    for item, members in RESULTS_WS[target].items():
        total = 0
        hit = 0
        for s_ls in score_list:
            if s_ls not in RESULTS_FQ:
                continue
            freq = RESULTS_FQ[s_ls]
            # The denominator depends only on the context key, so compute it
            # once per key instead of once per matching member word.
            denom = sum(freq.values())
            for w_s in members:
                if w_s == target:
                    continue
                w_id = get_term_id(w_s)
                if w_id in freq:
                    total += freq[w_id] / denom
                    hit += 1
        n_members = len(members)
        # A label without member words cannot score; avoid dividing by zero.
        scores[item] = (total * hit) / n_members if n_members else 0

    if not scores:
        return "_UNDEFINE_"

    # max() keeps the first entry among ties, matching a stable
    # descending sort of the items.
    best_item, best_score = max(scores.items(), key=lambda e: e[1])
    if best_score != 0:
        return best_item
    return "_UNDEFINE_"
Ejemplo n.º 6
0
def predict_sent(str_tst):
    """Print a sentence with a predicted label attached to each known word.

    The stripped sentence is segmented with ``pynlpir.segment``. Segments
    that are not keys of RESULTS_WS are printed unchanged. For every other
    segment, up to three context keys (``left << TAG_SHIFT | right``) are
    built from its neighbours, each label under ``RESULTS_WS[segment]`` is
    scored from the raw counts stored in RESULTS_FQ, every contributing
    match is printed as a trace line, and the segment is printed as
    ``segment/label`` (or ``segment/_UNDEFINE_`` when nothing scored).
    """
    global RESULTS_WS
    global RESULTS_FQ
    global RESULTS_STOP

    PADDING_ID = get_term_id(PADDING)

    segments = pynlpir.segment(str_tst.strip(), pos_tagging=False)
    n_seg = len(segments)

    for i, word in enumerate(segments):
        if word not in RESULTS_WS:
            print(word, end=' ')
            continue

        # Prepare the surrounding words.
        context_keys = []
        if i >= 1:
            left_id = get_term_id(segments[i - 1])
        else:
            left_id = get_term_id(PADDING)
        if i + 1 < n_seg:
            right_id = get_term_id(segments[i + 1])
        else:
            right_id = get_term_id(PADDING)
        if left_id == right_id and left_id == get_term_id(PADDING):
            print('%s/%s ' % (word, "_UNDEFINE_"), end='')
            continue

        # Immediate neighbours.
        if left_id == PADDING_ID and segments[i + 1] in RESULTS_STOP:
            pass
        elif right_id == PADDING_ID and segments[i - 1] in RESULTS_STOP:
            pass
        else:
            context_keys.append(left_id << TAG_SHIFT | right_id)

        # Two segments to the left.
        if i >= 2:
            left_id = get_term_id(segments[i - 2])
            right_id = get_term_id(segments[i - 1])
            if left_id == PADDING_ID and segments[i - 1] in RESULTS_STOP:
                pass
            elif right_id == PADDING_ID and segments[i - 2] in RESULTS_STOP:
                pass
            else:
                context_keys.append(left_id << TAG_SHIFT | right_id)

        # Two segments to the right.
        if i + 2 < n_seg:
            left_id = get_term_id(segments[i + 1])
            right_id = get_term_id(segments[i + 2])
            if left_id == PADDING_ID and segments[i + 2] in RESULTS_STOP:
                pass
            elif right_id == PADDING_ID and segments[i + 1] in RESULTS_STOP:
                pass
            else:
                context_keys.append(left_id << TAG_SHIFT | right_id)

        scores = {}
        for label, members in RESULTS_WS[word].items():
            scores[label] = 0
            hits = 0
            for key in context_keys:
                if key not in RESULTS_FQ:
                    continue
                for member in members:
                    if member == word:
                        continue
                    if get_term_id(member) in RESULTS_FQ[key]:
                        scores[label] += RESULTS_FQ[key][get_term_id(member)]
                        hits += 1
                        # Trace line: label, member word, context pair, count.
                        print(" %s-%s-[%s-%s]-%d " %
                              (label, member,
                               get_term_wd(key >> TAG_SHIFT),
                               get_term_wd(key & TAG_MASK),
                               RESULTS_FQ[key][get_term_id(member)]))
            scores[label] = (scores[label] * hits) / len(members)

        ranked = sorted(scores.items(), key=lambda e: e[1], reverse=True)
        top_label, top_score = ranked[0]

        if top_score != 0:
            print('%s/%s ' % (word, top_label), end='')
        else:
            print('%s/%s ' % (word, "_UNDEFINE_"), end='')
    print()
    return
Ejemplo n.º 7
0
def predict_one_shot(str_tst, aimword):
    """Choose the best-scoring RESULTS_WS label for ``aimword`` in a sentence.

    The stripped sentence is segmented with ``pynlpir.segment``. Around the
    first occurrence of ``aimword`` up to three context keys are built, each
    packing two ids as ``left << TAG_SHIFT | right``: the immediate
    neighbours, the two preceding segments and the two following segments.
    A pair is skipped when one side is the PADDING id and the segment on the
    other side is in RESULTS_STOP.

    Every label under ``RESULTS_WS[aimword]`` is then scored. For each
    context key present in RESULTS_FQ, each member word of the label (other
    than ``aimword`` itself) whose id appears under that key contributes its
    count divided by the total count stored under the key. The summed
    contributions are multiplied by the number of hits and divided by the
    number of member words.

    Args:
        str_tst: Raw sentence text.
        aimword: The segment to label.

    Returns:
        None when ``aimword`` is not among the segments or the sentence has
        fewer than five segments; ``"_UNDEFINE_"`` when both neighbours
        resolve to the PADDING id, when there is no label to score, or when
        the best score is zero; otherwise the label with the highest score
        (the first one encountered on ties).
    """
    global RESULTS_WS
    global RESULTS_FQ
    global RESULTS_STOP

    PADDING_ID = get_term_id(PADDING)

    str_tst = str_tst.strip()
    seg_list = pynlpir.segment(str_tst, pos_tagging=False)
    len_s = len(seg_list)

    if aimword not in seg_list or len_s < 5:
        return None

    i = seg_list.index(aimword)
    target = seg_list[i]

    # Prepare the surrounding words.
    score_list = []
    env_pre = get_term_id(seg_list[i - 1] if i >= 1 else PADDING)
    env_nex = get_term_id(seg_list[i + 1] if (i + 1) < len_s else PADDING)
    if env_pre == env_nex and env_pre == PADDING_ID:
        return "_UNDEFINE_"

    # Exactly one side can be padding from here on, so the index used on the
    # opposite side is always inside the segment list.
    if env_pre == PADDING_ID and seg_list[i + 1] in RESULTS_STOP:
        pass
    elif env_nex == PADDING_ID and seg_list[i - 1] in RESULTS_STOP:
        pass
    else:
        score_list.append(env_pre << TAG_SHIFT | env_nex)

    if (i - 2) >= 0:
        env_pre = get_term_id(seg_list[i - 2])
        env_nex = get_term_id(seg_list[i - 1])
        if env_pre == PADDING_ID and seg_list[i - 1] in RESULTS_STOP:
            pass
        elif env_nex == PADDING_ID and seg_list[i - 2] in RESULTS_STOP:
            pass
        else:
            score_list.append(env_pre << TAG_SHIFT | env_nex)

    if (i + 2) < len_s:
        env_pre = get_term_id(seg_list[i + 1])
        env_nex = get_term_id(seg_list[i + 2])
        if env_pre == PADDING_ID and seg_list[i + 2] in RESULTS_STOP:
            pass
        elif env_nex == PADDING_ID and seg_list[i + 1] in RESULTS_STOP:
            pass
        else:
            score_list.append(env_pre << TAG_SHIFT | env_nex)

    # Predict it!
    scores = {}
    for item, members in RESULTS_WS[target].items():
        total = 0
        hit = 0
        for s_ls in score_list:
            if s_ls not in RESULTS_FQ:
                continue
            freq = RESULTS_FQ[s_ls]
            # The denominator depends only on the context key, so compute it
            # once per key instead of once per matching member word.
            denom = sum(freq.values())
            for w_s in members:
                if w_s == target:
                    continue
                w_id = get_term_id(w_s)
                if w_id in freq:
                    total += freq[w_id] / denom
                    hit += 1
        n_members = len(members)
        # A label without member words cannot score; avoid dividing by zero.
        scores[item] = (total * hit) / n_members if n_members else 0

    if not scores:
        return "_UNDEFINE_"

    # max() keeps the first entry among ties, matching a stable
    # descending sort of the items.
    best_item, best_score = max(scores.items(), key=lambda e: e[1])
    if best_score != 0:
        return best_item
    return "_UNDEFINE_"
Ejemplo n.º 8
0
def predict_sent(str_tst):
    """Print a sentence with a predicted label attached to each known word.

    The stripped sentence is segmented with ``pynlpir.segment``. Segments
    that are not keys of RESULTS_WS are printed unchanged. For every other
    segment, up to three context keys (``left << TAG_SHIFT | right``) are
    built from its neighbours, each label under ``RESULTS_WS[segment]`` is
    scored from the raw counts stored in RESULTS_FQ, every contributing
    match is printed as a trace line, and the segment is printed as
    ``segment/label`` (or ``segment/_UNDEFINE_`` when nothing scored).
    """
    global RESULTS_WS
    global RESULTS_FQ
    global RESULTS_STOP

    PADDING_ID = get_term_id(PADDING)

    segments = pynlpir.segment(str_tst.strip(), pos_tagging=False)
    n_seg = len(segments)

    for i, word in enumerate(segments):
        if word not in RESULTS_WS:
            print(word, end=' ')
            continue

        # Prepare the surrounding words.
        context_keys = []
        if i >= 1:
            left_id = get_term_id(segments[i - 1])
        else:
            left_id = get_term_id(PADDING)
        if i + 1 < n_seg:
            right_id = get_term_id(segments[i + 1])
        else:
            right_id = get_term_id(PADDING)
        if left_id == right_id and left_id == get_term_id(PADDING):
            print('%s/%s '%(word, "_UNDEFINE_"), end='')
            continue

        # Immediate neighbours.
        if left_id == PADDING_ID and segments[i + 1] in RESULTS_STOP:
            pass
        elif right_id == PADDING_ID and segments[i - 1] in RESULTS_STOP:
            pass
        else:
            context_keys.append(left_id << TAG_SHIFT | right_id)

        # Two segments to the left.
        if i >= 2:
            left_id = get_term_id(segments[i - 2])
            right_id = get_term_id(segments[i - 1])
            if left_id == PADDING_ID and segments[i - 1] in RESULTS_STOP:
                pass
            elif right_id == PADDING_ID and segments[i - 2] in RESULTS_STOP:
                pass
            else:
                context_keys.append(left_id << TAG_SHIFT | right_id)

        # Two segments to the right.
        if i + 2 < n_seg:
            left_id = get_term_id(segments[i + 1])
            right_id = get_term_id(segments[i + 2])
            if left_id == PADDING_ID and segments[i + 2] in RESULTS_STOP:
                pass
            elif right_id == PADDING_ID and segments[i + 1] in RESULTS_STOP:
                pass
            else:
                context_keys.append(left_id << TAG_SHIFT | right_id)

        scores = {}
        for label, members in RESULTS_WS[word].items():
            scores[label] = 0
            hits = 0
            for key in context_keys:
                if key not in RESULTS_FQ:
                    continue
                for member in members:
                    if member == word:
                        continue
                    if get_term_id(member) in RESULTS_FQ[key]:
                        scores[label] += RESULTS_FQ[key][get_term_id(member)]
                        hits += 1
                        # Trace line: label, member word, context pair, count.
                        print(" %s-%s-[%s-%s]-%d " %
                              (label, member,
                               get_term_wd(key >> TAG_SHIFT),
                               get_term_wd(key & TAG_MASK),
                               RESULTS_FQ[key][get_term_id(member)]))
            scores[label] = (scores[label] * hits) / len(members)

        ranked = sorted(scores.items(), key=lambda e: e[1], reverse=True)
        top_label, top_score = ranked[0]

        if top_score != 0:
            print('%s/%s '%(word, top_label), end='')
        else:
            print('%s/%s '%(word, "_UNDEFINE_"), end='')
    print()
    return