예제 #1
0
def segmentation_graph(filepath, seg, lfreq, pfreq):
    """Segment each line of a file and append the results to ``seg``.

    The text returned by ``read(filepath)`` is split on ``'\\n'``. Each
    resulting line is wrapped in ``<BOS>``/``<EOS>`` markers, a DAG is built
    for it with ``get_DAG`` and the value returned by
    ``calc_bigram_by_graph`` is appended to ``seg``.

    Args:
        filepath: Path handed to the ``read`` helper.
        seg: List that receives one entry per line (mutated in place).
        lfreq: Passed through to ``get_DAG`` and ``calc_bigram_by_graph``
            (presumably the word-frequency dictionary -- confirm).
        pfreq: Passed through to ``calc_bigram_by_graph`` (presumably the
            bigram-frequency table -- confirm).
    """
    for raw_line in read(filepath).split('\n'):
        # NOTE(review): a piece produced by splitting a str on '\n' cannot
        # itself contain '\n', so this comparison looks like it can never be
        # equal and every line (including blank ones) ends up wrapped.
        # Confirm whether blank lines were meant to be skipped instead.
        if raw_line == "\r\n":
            marked = raw_line
        else:
            marked = '<BOS>' + raw_line + '<EOS>'
        dag = get_DAG(marked, lfreq)
        # A fresh, empty route table is handed to the helper for every line.
        words = calc_bigram_by_graph(marked, dag, {}, lfreq, pfreq)
        seg.append(words)
예제 #2
0
def segmentation_mr(filepath, seg, lfreq, ltotal):
    """Segment each line of a file by following a precomputed route.

    For every line of ``read(filepath)`` (split on ``'\\n'``) a DAG is built
    with ``get_DAG`` and ``calc_max_route`` is called with an empty ``route``
    dict, which it is expected to fill in (TODO confirm against the helper).
    The line is then cut into words by walking ``route`` from index 0, and
    the resulting word list is appended to ``seg``.

    Args:
        filepath: Path handed to the ``read`` helper.
        seg: List that receives one word list per line (mutated in place).
        lfreq: Passed through to ``get_DAG`` and ``calc_max_route``.
        ltotal: Passed through to ``calc_max_route``.
    """

    def _walk(text, route):
        # route[i][1] is used as the inclusive index of the last character
        # of the word that starts at i; the next word starts right after it.
        cursor, limit = 0, len(text)
        while cursor < limit:
            last = route[cursor][1]
            yield text[cursor:last + 1]
            cursor = last + 1

    for line in read(filepath).split('\n'):
        dag = get_DAG(line, lfreq)
        best_route = {}
        calc_max_route(line, dag, best_route, lfreq, ltotal)
        words = list(_walk(line, best_route))
        seg.append(words)
예제 #3
0
def build_pfdict(dic_path):
    """Build a prefix dictionary from a word-frequency dictionary file.

    The text returned by ``read(dic_path)`` is split on ``'\\n'``. Every
    non-blank line must start with a word and an integer frequency separated
    by a single space; any further space-separated fields are ignored.
    Blank or whitespace-only lines (for example the empty piece that follows
    a trailing newline, or a lone ``'\\r'`` in a CRLF file) are skipped
    instead of raising.

    Args:
        dic_path: Path handed to the ``read`` helper.

    Returns:
        A ``(lfreq, ltotal)`` tuple. ``lfreq`` maps every dictionary word to
        its frequency and every prefix of a dictionary word that is not
        itself a dictionary word (so far) to 0. ``ltotal`` is the sum of all
        frequencies read from the file.

    Raises:
        ValueError: If a non-blank line has fewer than two space-separated
            fields or its second field is not an integer.
    """
    lfreq = {}  # words and prefixes of the prefix dictionary -> frequency
    ltotal = 0  # running total of all word frequencies
    for line in read(dic_path).split('\n'):
        # Splitting on '\n' yields an empty last piece when the file ends
        # with a newline; unpacking such a line would raise ValueError.
        if not line.strip():
            continue
        # Record the word and its frequency from the offline dictionary.
        word, raw_freq = line.split(' ')[0:2]
        freq = int(raw_freq)
        lfreq[word] = freq
        ltotal += freq
        # Register every prefix of the word; prefixes that are already
        # present (including the word itself) keep their current value.
        for end in range(1, len(word) + 1):
            lfreq.setdefault(word[:end], 0)
    return lfreq, ltotal
예제 #4
0
def segmentation_bigram(filepath, seg, lfreq, pfreq):
    """Segment each line of a file by following the route from ``calc_bigram``.

    For every line of ``read(filepath)`` (split on ``'\\n'``):

    * an empty line contributes an empty list to ``seg``;
    * any other line is wrapped in ``<BOS>``/``<EOS>``, its DAG and forward
      DAG are built, ``calc_bigram`` is called with an empty ``route`` dict
      (which it is expected to populate -- TODO confirm), and the route is
      walked to cut the line into words that are appended to ``seg`` as one
      list.

    Args:
        filepath: Path handed to the ``read`` helper.
        seg: List that receives one word list per line (mutated in place).
        lfreq: Passed through to ``get_DAG`` and ``calc_bigram``.
        pfreq: Mapping whose values are themselves mappings; for each key
            the list of inner keys is precomputed and passed to
            ``calc_bigram`` together with ``pfreq`` itself.
    """
    # For every key of pfreq, the list of keys of its inner mapping.
    prewords = {head: list(pfreq[head].keys()) for head in pfreq.keys()}

    for line in read(filepath).split('\n'):
        if line == '':
            seg.append([])
            continue

        padded = '<BOS>' + line + '<EOS>'
        # Index at which the trailing '<EOS>' marker (5 characters) begins.
        stop = len(padded) - 5
        dag = get_DAG(padded, lfreq)
        forward_dag = get_forward_DAG(dag)
        route = {}
        # Build the route for this sentence.
        calc_bigram(padded, dag, forward_dag, route, lfreq, pfreq, prewords)

        # Find the first route key whose first component is 4, i.e. the
        # index of the last character of '<BOS>'.
        # NOTE(review): presumably this key identifies the '<BOS>' word and
        # key[1] is where the first real word starts -- confirm.
        first = next((key for key in route.keys() if key[0] == 4), None)
        if first is None:
            left = right = 0
        else:
            left = first[1]
            right = route[first][1]

        words = [padded[left:right]]
        # Each route entry's second item is used as the end of the next word.
        while right != stop:
            step = route[(left, right)]
            left, right = right, step[1]
            words.append(padded[left:right])
        seg.append(words)