コード例 #1
0
def sub_correct_me_ext(head_t, check_t, tail_t, type_t):
    if not check_t:
        return None

    if not head_t and not tail_t:
        return None

    p_res_1 = {}
    p_res_2 = {}
    max_pro = 0
    max_item = None

    if head_t:
        pinyin_t = pinyin.word2pinyin_split(
            head_t, '-') + '-' + pinyin.word2pinyin_split(check_t, '-')
        pinyin_ts = hash_pinyin(pinyin_t)
        for pinyin_t in pinyin_ts:
            if pinyin_t in JIEBA_PINYIN.keys():
                list_t = JIEBA_PINYIN.get(pinyin_t)
                for item in list_t:
                    if head_t != item[0:len(head_t)]:
                        continue
                    else:
                        p_res_1[item[len(head_t):]] = JIEBA_HZ.get(item)

    if tail_t:
        pinyin_t = pinyin.word2pinyin_split(
            check_t, '-') + '-' + pinyin.word2pinyin_split(tail_t, '-')
        pinyin_ts = hash_pinyin(pinyin_t)
        for pinyin_t in pinyin_ts:
            if pinyin_t in JIEBA_PINYIN.keys():
                list_t = JIEBA_PINYIN.get(pinyin_t)
                for item in list_t:
                    if tail_t != item[len(check_t):]:
                        continue
                    else:
                        p_res_2[item[0:len(check_t)]] = JIEBA_HZ.get(item)

    if not p_res_1:
        p_res_1 = p_res_2
    elif not p_res_2:
        p_res_2 = p_res_1

    p_res_intr = dict.fromkeys(x for x in p_res_1 if x in p_res_2)
    if p_res_intr:
        for item in p_res_intr:
            p_res_intr[item] = p_res_1[item] * p_res_2[item] / (p_res_1[item] +
                                                                p_res_2[item])
            if p_res_intr[item] > max_pro:
                max_pro = p_res_intr[item]
                max_item = item

    return ({
        'item': max_item,
        'pro': max_pro,
        'data': p_res_intr,
        'type': type_t
    })
コード例 #2
0
def sub_correct_me_ext(head_t, check_t, tail_t, type_t):
    if not check_t:
        return None

    if not head_t and not tail_t:
        return None

    p_res_1 = {}
    p_res_2 = {}
    max_pro = 0
    max_item = None
    
    if head_t:    
        pinyin_t = pinyin.word2pinyin_split(head_t,'-') + '-' + pinyin.word2pinyin_split(check_t,'-')
        pinyin_ts = hash_pinyin(pinyin_t)
        for pinyin_t in pinyin_ts:
            if pinyin_t in JIEBA_PINYIN.keys():
                list_t = JIEBA_PINYIN.get(pinyin_t)
                for item in list_t:
                    if head_t != item[0:len(head_t)]:
                        continue
                    else:
                        p_res_1[item[len(head_t):]] = JIEBA_HZ.get(item)
    
    if tail_t:
        pinyin_t = pinyin.word2pinyin_split(check_t,'-') + '-' + pinyin.word2pinyin_split(tail_t,'-')
        pinyin_ts = hash_pinyin(pinyin_t)
        for pinyin_t in pinyin_ts:
            if pinyin_t in JIEBA_PINYIN.keys():
                list_t = JIEBA_PINYIN.get(pinyin_t)
                for item in list_t:
                    if tail_t != item[len(check_t):]:
                        continue
                    else:
                        p_res_2[item[0:len(check_t)]] = JIEBA_HZ.get(item)
    
    if not p_res_1:
        p_res_1 = p_res_2
    elif not p_res_2:
        p_res_2 = p_res_1

    p_res_intr = dict.fromkeys(x for x in p_res_1 if x in p_res_2)
    if p_res_intr:
        for item in p_res_intr:
            p_res_intr[item] = p_res_1[item]*p_res_2[item] / (p_res_1[item] + p_res_2[item])
            if p_res_intr[item] > max_pro:
                max_pro = p_res_intr[item]
                max_item = item
    
    return ({'item':max_item, 'pro':max_pro, 'data':p_res_intr, 'type': type_t})
コード例 #3
0
            for line in fin:
                i = i + 1
                if not i % 1000:
                    print("C:%d" % (i))
                if not line.find("\\2-grams:"):
                    start_flag = 1
                    continue
                if not line.find("\\end\\"):
                    continue
                if start_flag:
                    aa = line.split()
                    pinyin_t = ""
                    if aa:
                        item_key = aa[1] + aa[2]
                        if aa[1] == "<s>" or aa[1] == "</s>":
                            pinyin_t = aa[1] + "-" + pinyin.word2pinyin_split(aa[2], "-")
                        elif aa[2] == "<s>" or aa[2] == "</s>":
                            pinyin_t = pinyin.word2pinyin_split(aa[1], "-") + "-" + aa[2]
                        else:
                            pinyin_t = pinyin.word2pinyin_split(aa[1], "-") + "-" + pinyin.word2pinyin_split(aa[2], "-")
                        if pinyin_t in JIEBA_PINYIN.keys():
                            JIEBA_PINYIN[pinyin_t].append(item_key)
                        else:
                            JIEBA_PINYIN[pinyin_t] = [item_key]
    # 保存
    print("保存JIEBA词频信息")
    with open(FILE_NAME_JIEBA_PINYIN, "wb") as fout:
        pickle.dump(JIEBA_PINYIN, fout, True)


print("你好,处理完毕")
コード例 #4
0
def correct_me(str_test):
    #    str_len = len(str_test)
    #    print("\n==单字测试==")
    #    for i in range(1,str_len):
    #        tmp_str = str_test[i-1] + str_test[i]
    #        if is_terminator(str_test[i]):
    #            tmp_str = str_test[i-1] + '</s>'
    #        if is_terminator(str_test[i-1]):
    #            tmp_str = '<s>' + str_test[i]
    #        if UNIC_HZ.get(tmp_str):
    #            print("%s->%f" % (tmp_str,UNIC_HZ.get(tmp_str)),end="\t")
    #        else:
    #            print("%s->%f" % (tmp_str,0),end="\t")

    print("")
    print("==NLPIR分词==")
    print("测试语句:%s" % (str_test))
    line_p = hanzi_prep.split_into_sentences(str_test)
    lines = []
    for line_i in line_p:
        lines.extend(line_i)
    str_i = ''.join(lines)
    #jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    print("分词结果:%s" % (repr(jieba_i)))
    jieba_i = jieba_i.split()
    jieba_len = len(jieba_i)
    if jieba_len < 3:
        print("词数太小,放弃纠错!")
        return
    jieba_key = []
    jieba_pro = []
    for i in range(1, jieba_len):

        #是否是标点符号
        #        if i == 0:
        #            tmp_str = '<s>' + jieba_i[i]
        #        if i == jieba_len -1:
        #            tmp_str = jieba_i[i] + '</s>'
        #        else:
        #            #默认模式
        #            tmp_str = jieba_i[i-1] + jieba_i[i]
        #            if len(jieba_i[i]) == 1:
        #                if is_terminator(jieba_i[i]):
        #                    tmp_str = jieba_i[i-1] + '</s>'
        #            if len(jieba_i[i-1]) == 1:
        #                if is_terminator(jieba_i[i-1]):
        #                    tmp_str = '<s>' + jieba_i[i]
        #不考虑开头结尾模式
        tmp_str = jieba_i[i - 1] + jieba_i[i]
        pro = JIEBA_HZ.get(tmp_str)

        jieba_key.append(tmp_str)
        if pro:
            jieba_pro.append(pro)
        else:
            jieba_pro.append(0)

#    if min_index != -1:
#        print("\n可能错误位置:",end="")
#        if min_index > 1:
#            print("%s"%jieba_i[min_index-1],end="")
#        print("%s"%jieba_i[min_index])
#        if min_index < (jieba_len - 1):
#            print("%s"%jieba_i[min_index+1],end="")
    print("分词表:" + repr(jieba_key))
    print("概率表:" + repr(jieba_pro))

    jieba_pro_t = []
    for i in range(0, jieba_len - 2):
        jieba_pro_t.append(jieba_pro[i] + jieba_pro[i + 1])

    min_index = jieba_pro_t.index(min(jieba_pro_t)) + 1
    print("可疑位置:[%d]->%s" % (min_index, jieba_i[min_index]))
    to_do = []
    g_check_a = None
    g_check_e = None
    #纠错位置不可能在开头或者结尾
    to_do.append(jieba_i[min_index - 1])
    to_do.append(jieba_i[min_index])
    to_do.append(jieba_i[min_index + 1])
    if min_index - 2 >= 0:
        g_check_a = jieba_i[min_index - 2]
    if min_index + 2 < jieba_len:
        g_check_e = jieba_i[min_index + 2]

    print("需要处理:" + repr(to_do))
    print("辅助检测:%s,%s" % (g_check_a, g_check_e))

    #保存最终的结果
    p_res_stage1 = {}
    p_res_stage2 = {}
    p_res_stage3 = {}
    max_item_1 = None
    max_item_2 = None
    max_item_3 = None
    max_pro_1 = 0
    max_item_1 = None
    max_pro_2 = 0
    max_item_2 = None
    max_pro_3 = 0
    max_item_3 = None

    #STAGE1 假设分词没有错误
    pinyin_t = pinyin.word2pinyin_split(
        to_do[0], '-') + '-' + pinyin.word2pinyin_split(to_do[1], '-')
    p_res_1 = {}
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            if to_do[0] != item[0:len(to_do[0])]:
                continue
            else:
                p_res_1[item[len(to_do[0]):]] = JIEBA_HZ.get(item)

    pinyin_t = pinyin.word2pinyin_split(
        to_do[1], '-') + '-' + pinyin.word2pinyin_split(to_do[2], '-')
    p_res_2 = {}
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            if to_do[2] != item[len(to_do[1]):]:
                continue
            else:
                p_res_2[item[0:len(to_do[1])]] = JIEBA_HZ.get(item)
                #print("2.找到:%s-%s,概率%f\t" %(to_do[0],item,JIEBA_HZ.get(item)))

    p_res_intr = dict.fromkeys(x for x in p_res_1 if x in p_res_2)
    if p_res_intr:
        max_pro_1 = 0
        max_item_1 = None
        for item in p_res_intr:
            p_res_intr[item] = p_res_1[item] * p_res_2[item] / (p_res_1[item] +
                                                                p_res_2[item])
            if p_res_intr[item] > max_pro_1:
                max_pro_1 = p_res_intr[item]
                max_item_1 = item
        print(repr(p_res_intr))
        p_res_stage1 = p_res_intr

    #STAGE2 假设第一和第二个合并
    to_do_a = [to_do[0] + to_do[1], to_do[2]]
    p_res_3 = {}
    p_res_s3 = {}
    pinyin_t = pinyin.word2pinyin_split(
        to_do_a[0], '-') + '-' + pinyin.word2pinyin_split(to_do_a[1], '-')
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            print(item)
            if to_do_a[1] != item[len(to_do_a[1]):]:
                continue
            else:
                p_res_3[item[:len(to_do_a[1])]] = JIEBA_HZ.get(item)

    if g_check_a:
        for item in p_res_3:
            item_t = g_check_a + item
            if item_t in JIEBA_HZ.keys():
                p_res_s3[item] = JIEBA_HZ.get(item_t)
    else:
        p_res_s3 = p_res_3

    p_res_intr = dict.fromkeys(x for x in p_res_3 if x in p_res_s3)
    if p_res_intr:
        for item in p_res_intr:
            p_res_intr[item] = p_res_3[item] * p_res_s3[item] / (
                p_res_3[item] + p_res_s3[item])
            if p_res_intr[item] > max_pro_2:
                max_pro_2 = p_res_intr[item]
                max_item_2 = item
        p_res_stage2 = p_res_intr

#STAGE3 假设第二和第三个合并
    to_do_b = [to_do[0], to_do[1] + to_do[2]]
    p_res_4 = {}
    p_res_s4 = {}
    pinyin_t = pinyin.word2pinyin_split(
        to_do_b[0], '-') + '-' + pinyin.word2pinyin_split(to_do_b[1], '-')
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            if to_do_b[0] != item[0:len(to_do_b[0])]:
                continue
            else:
                p_res_4[item[len(to_do_b[0]):]] = JIEBA_HZ.get(item)

    if g_check_e:
        for item in p_res_4:
            item_t = item + g_check_e
            if item_t in JIEBA_HZ.keys():
                p_res_s4[item] = JIEBA_HZ.get(item_t)
    else:
        p_res_s4 = p_res_4

    p_res_intr = dict.fromkeys(x for x in p_res_4 if x in p_res_s4)
    if p_res_intr:
        for item in p_res_intr:
            p_res_intr[item] = p_res_4[item] * p_res_s4[item] / (
                p_res_4[item] + p_res_s4[item])
            if p_res_intr[item] > max_pro_3:
                max_pro_3 = p_res_intr[item]
                max_item_3 = item
        print(repr(p_res_intr))
        p_res_stage3 = p_res_intr

    #打印纠正结果
    if max_item_1:
        print("STAGE1:纠错结果:%s %s %s,概率%f" %
              (to_do[0], max_item_1, to_do[2], p_res_stage1[max_item_1]))
    else:
        print("STAGE1:纠错失败")
    if max_item_2:
        print("STAGE2:纠错结果:%s %s,概率%f" %
              (max_item_2, to_do_a[1], p_res_stage2[max_item_2]))
    else:
        print("STAGE2:纠错失败")
    if max_item_3:
        print("STAGE3:纠错结果:%s %s,概率%f" %
              (to_do_b[0], max_item_3, p_res_stage3[max_item_3]))
    else:
        print("STAGE3:纠错失败")

    max_pro = max([max_pro_1, max_pro_2, max_pro_3])
    if max_pro != 0:
        if max_pro == max_pro_1:
            final_words = jieba_i[0:min_index - 1] + [
                to_do[0], max_item_1, to_do[2]
            ] + jieba_i[min_index + 2:jieba_len]
        elif max_pro == max_pro_2:
            final_words = jieba_i[0:min_index - 1] + [
                max_item_2, to_do_a[1]
            ] + jieba_i[min_index + 2:jieba_len]
        elif max_pro == max_pro_3:
            final_words = jieba_i[0:min_index - 1] + [
                to_do_b[0], max_item_3
            ] + jieba_i[min_index + 2:jieba_len]

        print("原句: " + str_test)
        print("纠正:" + ''.join(final_words))
        return (''.join(final_words))
    else:
        print('纠错失败')
        return None
コード例 #5
0
def correct_me(str_test):
#    str_len = len(str_test)
#    print("\n==单字测试==")
#    for i in range(1,str_len):
#        tmp_str = str_test[i-1] + str_test[i]
#        if is_terminator(str_test[i]):
#            tmp_str = str_test[i-1] + '</s>'
#        if is_terminator(str_test[i-1]):
#            tmp_str = '<s>' + str_test[i]
#        if UNIC_HZ.get(tmp_str):
#            print("%s->%f" % (tmp_str,UNIC_HZ.get(tmp_str)),end="\t")
#        else:
#            print("%s->%f" % (tmp_str,0),end="\t")


    print("")
    print("==NLPIR分词==")
    print("测试语句:%s" %(str_test))
    line_p = hanzi_prep.split_into_sentences(str_test)
    lines = []
    for line_i in line_p:
        lines.extend(line_i)
    str_i = ''.join(lines)
    #jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    print("分词结果:%s"%(repr(jieba_i)))
    jieba_i = jieba_i.split()
    jieba_len = len(jieba_i)
    if jieba_len < 3:
        print("词数太小,放弃纠错!")
        return
    jieba_key = []
    jieba_pro = []
    for i in range(1,jieba_len):

        #是否是标点符号
#        if i == 0:
#            tmp_str = '<s>' + jieba_i[i]
#        if i == jieba_len -1:
#            tmp_str = jieba_i[i] + '</s>'
#        else:
#            #默认模式
#            tmp_str = jieba_i[i-1] + jieba_i[i]
#            if len(jieba_i[i]) == 1:
#                if is_terminator(jieba_i[i]):
#                    tmp_str = jieba_i[i-1] + '</s>'
#            if len(jieba_i[i-1]) == 1:
#                if is_terminator(jieba_i[i-1]):
#                    tmp_str = '<s>' + jieba_i[i]
        #不考虑开头结尾模式
        tmp_str = jieba_i[i-1] + jieba_i[i]
        pro = JIEBA_HZ.get(tmp_str)

        jieba_key.append(tmp_str)
        if pro:
            jieba_pro.append(pro)
        else:
            jieba_pro.append(0)

#    if min_index != -1:
#        print("\n可能错误位置:",end="")
#        if min_index > 1:
#            print("%s"%jieba_i[min_index-1],end="")
#        print("%s"%jieba_i[min_index])
#        if min_index < (jieba_len - 1):
#            print("%s"%jieba_i[min_index+1],end="")
    print("分词表:"+repr(jieba_key))
    print("概率表:"+repr(jieba_pro))

    jieba_pro_t = []
    for i in range(0,jieba_len-2):
        jieba_pro_t.append( jieba_pro[i] + jieba_pro[i+1])

    min_index = jieba_pro_t.index(min(jieba_pro_t)) + 1
    print("可疑位置:[%d]->%s"%(min_index,jieba_i[min_index]))
    to_do = []
    g_check_a = None
    g_check_e = None
    #纠错位置不可能在开头或者结尾
    to_do.append(jieba_i[min_index-1])
    to_do.append(jieba_i[min_index])
    to_do.append(jieba_i[min_index+1])
    if min_index - 2 >= 0:
        g_check_a = jieba_i[min_index-2]
    if min_index + 2 < jieba_len:
        g_check_e = jieba_i[min_index+2]

    print("需要处理:"+repr(to_do))
    print("辅助检测:%s,%s" %(g_check_a, g_check_e))

    #保存最终的结果
    p_res_stage1 = {}
    p_res_stage2 = {}
    p_res_stage3 = {}
    max_item_1 = None
    max_item_2 = None
    max_item_3 = None
    max_pro_1 = 0
    max_item_1 = None
    max_pro_2 = 0
    max_item_2 = None
    max_pro_3 = 0
    max_item_3 = None

    #STAGE1 假设分词没有错误
    pinyin_t = pinyin.word2pinyin_split(to_do[0],'-') + '-' + pinyin.word2pinyin_split(to_do[1],'-')
    p_res_1 = {}
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            if to_do[0] != item[0:len(to_do[0])]:
                continue
            else:
                p_res_1[item[len(to_do[0]):]] = JIEBA_HZ.get(item)

    pinyin_t = pinyin.word2pinyin_split(to_do[1],'-') + '-' + pinyin.word2pinyin_split(to_do[2],'-')
    p_res_2 = {}
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            if to_do[2] != item[len(to_do[1]):]:
                continue
            else:
                p_res_2[item[0:len(to_do[1])]] = JIEBA_HZ.get(item)
                #print("2.找到:%s-%s,概率%f\t" %(to_do[0],item,JIEBA_HZ.get(item)))

    p_res_intr = dict.fromkeys(x for x in p_res_1 if x in p_res_2)
    if p_res_intr:
        max_pro_1 = 0
        max_item_1 = None
        for item in p_res_intr:
            p_res_intr[item] = p_res_1[item]*p_res_2[item] / (p_res_1[item] + p_res_2[item])
            if p_res_intr[item] > max_pro_1:
                max_pro_1 = p_res_intr[item]
                max_item_1 = item
        print(repr(p_res_intr))
        p_res_stage1 = p_res_intr

    #STAGE2 假设第一和第二个合并
    to_do_a = [to_do[0]+to_do[1], to_do[2]]
    p_res_3 = {}
    p_res_s3 = {}
    pinyin_t = pinyin.word2pinyin_split(to_do_a[0],'-') + '-' + pinyin.word2pinyin_split(to_do_a[1],'-')
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            print(item)
            if to_do_a[1] != item[len(to_do_a[1]):]:
                continue
            else:
                p_res_3[item[:len(to_do_a[1])]] = JIEBA_HZ.get(item)

    if g_check_a:
        for item in p_res_3:
            item_t = g_check_a+item
            if item_t in JIEBA_HZ.keys():
                p_res_s3[item] = JIEBA_HZ.get(item_t)
    else:
        p_res_s3 = p_res_3

    p_res_intr = dict.fromkeys(x for x in p_res_3 if x in p_res_s3)
    if p_res_intr:
        for item in p_res_intr:
            p_res_intr[item] = p_res_3[item]*p_res_s3[item] / (p_res_3[item] + p_res_s3[item])
            if p_res_intr[item] > max_pro_2:
                max_pro_2 = p_res_intr[item]
                max_item_2 = item
        p_res_stage2 = p_res_intr

#STAGE3 假设第二和第三个合并
    to_do_b = [to_do[0], to_do[1]+to_do[2]]
    p_res_4 = {}
    p_res_s4 = {}
    pinyin_t = pinyin.word2pinyin_split(to_do_b[0],'-') + '-' + pinyin.word2pinyin_split(to_do_b[1],'-')
    if pinyin_t in JIEBA_PINYIN.keys():
        list_t = JIEBA_PINYIN.get(pinyin_t)
        for item in list_t:
            if to_do_b[0] != item[0:len(to_do_b[0])]:
                continue
            else:
                p_res_4[item[len(to_do_b[0]):]] = JIEBA_HZ.get(item)

    if g_check_e:
        for item in p_res_4:
            item_t = item + g_check_e
            if item_t in JIEBA_HZ.keys():
                p_res_s4[item] = JIEBA_HZ.get(item_t)
    else:
        p_res_s4 = p_res_4

    p_res_intr = dict.fromkeys(x for x in p_res_4 if x in p_res_s4)
    if p_res_intr:
        for item in p_res_intr:
            p_res_intr[item] = p_res_4[item]*p_res_s4[item] / (p_res_4[item] + p_res_s4[item])
            if p_res_intr[item] > max_pro_3:
                max_pro_3 = p_res_intr[item]
                max_item_3 = item
        print(repr(p_res_intr))
        p_res_stage3 = p_res_intr


    #打印纠正结果
    if max_item_1:
        print("STAGE1:纠错结果:%s %s %s,概率%f"%(to_do[0],max_item_1,to_do[2],p_res_stage1[max_item_1]))
    else:
        print("STAGE1:纠错失败")
    if max_item_2:
        print("STAGE2:纠错结果:%s %s,概率%f"%(max_item_2,to_do_a[1],p_res_stage2[max_item_2]))
    else:
        print("STAGE2:纠错失败")
    if max_item_3:
        print("STAGE3:纠错结果:%s %s,概率%f"%(to_do_b[0],max_item_3,p_res_stage3[max_item_3]))
    else:
        print("STAGE3:纠错失败")

    max_pro = max([max_pro_1, max_pro_2, max_pro_3])
    if max_pro != 0:
        if max_pro == max_pro_1:
            final_words = jieba_i[0:min_index-1] + [ to_do[0], max_item_1, to_do[2] ] + jieba_i[min_index+2:jieba_len]
        elif max_pro == max_pro_2:
            final_words = jieba_i[0:min_index-1] + [ max_item_2, to_do_a[1] ] + jieba_i[min_index+2:jieba_len]
        elif max_pro == max_pro_3:
            final_words = jieba_i[0:min_index-1] + [ to_do_b[0], max_item_3 ] + jieba_i[min_index+2:jieba_len]

        print("原句: "+str_test)
        print("纠正:"+''.join(final_words))
        return (''.join(final_words))
    else:
        print('纠错失败')
        return None
コード例 #6
0
ファイル: proc.py プロジェクト: zpeng1989/chinese_correct_wsd
         for line in fin:
             i = i + 1
             if not i % 1000:
                 print("C:%d" % (i))
             if not line.find('\\2-grams:'):
                 start_flag = 1
                 continue
             if not line.find('\\end\\'):
                 continue
             if start_flag:
                 aa = line.split()
                 pinyin_t = ""
                 if aa:
                     item_key = aa[1] + aa[2]
                     if aa[1] == '<s>' or aa[1] == '</s>':
                         pinyin_t = aa[1] + '-' + pinyin.word2pinyin_split(
                             aa[2], '-')
                     elif aa[2] == '<s>' or aa[2] == '</s>':
                         pinyin_t = pinyin.word2pinyin_split(
                             aa[1], '-') + '-' + aa[2]
                     else:
                         pinyin_t = pinyin.word2pinyin_split(
                             aa[1], '-') + '-' + pinyin.word2pinyin_split(
                                 aa[2], '-')
                     if pinyin_t in JIEBA_PINYIN.keys():
                         JIEBA_PINYIN[pinyin_t].append(item_key)
                     else:
                         JIEBA_PINYIN[pinyin_t] = [item_key]
 #保存
 print("保存JIEBA词频信息")
 with open(FILE_NAME_JIEBA_PINYIN, 'wb') as fout:
     pickle.dump(JIEBA_PINYIN, fout, True)