def segmentation_graph(filepath, seg, lfreq, pfreq):
    """Segment every sentence in *filepath* with the graph-based bigram model.

    Each sentence is wrapped in '<BOS>'/'<EOS>' markers, a word DAG is built
    from the prefix dictionary *lfreq*, and the per-sentence segmentation
    returned by calc_bigram_by_graph is appended to *seg* (mutated in place).
    """
    for sentence in read(filepath).split('\n'):
        # NOTE(review): split('\n') can never yield "\r\n", so this guard
        # looks ineffective — confirm the intended skip condition.
        if sentence == "\r\n":
            continue
        sentence = '<BOS>' + sentence + '<EOS>'
        dag = get_DAG(sentence, lfreq)
        route = {}
        seg.append(calc_bigram_by_graph(sentence, dag, route, lfreq, pfreq))
def segmentation_mr(filepath, seg, lfreq, ltotal):
    """Segment every sentence in *filepath* by the max-probability route.

    For each sentence a DAG is built from the prefix dictionary *lfreq*,
    calc_max_route fills *route* (start index -> (score, end index)), and the
    sentence is cut greedily by following the route.  The resulting word list
    is appended to *seg* (mutated in place).
    """
    for sentence in read(filepath).split('\n'):
        graph = get_DAG(sentence, lfreq)
        route = {}
        calc_max_route(sentence, graph, route, lfreq, ltotal)
        words = []
        idx, sent_len = 0, len(sentence)
        # Walk the route: each entry tells where the word starting at idx ends.
        while idx < sent_len:
            stop = route[idx][1]
            words.append(sentence[idx:stop + 1])
            idx = stop + 1
        seg.append(words)
def build_pfdict(dic_path):
    """Build the prefix dictionary from the offline dictionary file.

    Each dictionary line is expected to start with "<word> <freq> ...".
    Every proper prefix of every word is also entered with frequency 0 so
    that DAG construction can test prefixes with a single dict lookup.

    Returns:
        (lfreq, ltotal): lfreq maps word/prefix -> frequency;
        ltotal is the sum of all word frequencies.
    """
    lfreq = {}   # words and prefixes from the dictionary -> frequency
    ltotal = 0   # total frequency mass
    str_dic = read(dic_path).split('\n')
    for line in str_dic:
        parts = line.split(' ')
        if len(parts) < 2:
            # Skip blank or malformed lines — in particular the trailing
            # empty string that split('\n') yields for a newline-terminated
            # file, which previously raised ValueError on unpacking.
            continue
        word, freq = parts[0], int(parts[1])
        lfreq[word] = freq
        ltotal += freq
        # Register every prefix of the word (frequency 0 if not a real word).
        for i in range(len(word)):
            wfrag = word[:i + 1]
            if wfrag not in lfreq:
                lfreq[wfrag] = 0
    return lfreq, ltotal
def segmentation_bigram(filepath, seg, lfreq, pfreq):
    ''' Segment each sentence according to the computed route. '''
    # Precompute, for every word in the bigram table, the list of words that
    # may follow it (the inner dicts of pfreq are keyed by successor words —
    # presumably; verify against calc_bigram).
    prewords = {}
    keys = pfreq.keys()
    for key in keys:
        prewords[key] = [word for word in pfreq[key].keys()]
    sent_list = read(filepath).split('\n')
    for sent in sent_list:
        if sent == '':
            # Blank line: emit an empty segmentation to keep output aligned
            # with the input line numbering.
            seg.append([])
            continue
        if sent != "":
            sent = '<BOS>' + sent + '<EOS>'
            # N is the index where '<EOS>' (5 chars) begins in the wrapped
            # sentence, i.e. the end position of the real text.
            N = len(sent) - 5
            DAG = get_DAG(sent, lfreq)
            forward_DAG = get_forward_DAG(DAG)
            route = {}
            temp_seg = []
            # Fill route for this sentence; keys appear to be (start, end)
            # spans and values hold the chosen successor span — TODO confirm
            # against calc_bigram.
            calc_bigram(sent, DAG, forward_DAG, route, lfreq, pfreq, prewords)
            # print(route)
            pos0 = pos1 = 0
            # Find the first word after '<BOS>': index 4 is the last char of
            # the 5-char '<BOS>' marker, so a key ending at 4 is the BOS span.
            for key in route.keys():
                if key[0] == 4:
                    pos0 = key[1]
                    pos1 = route[key][1]
                    break
            temp_seg.append(sent[pos0:pos1])
            # Follow the route span-by-span until we reach the '<EOS>' marker.
            while pos1 != N:
                value = route[(pos0, pos1)]
                pos0 = pos1
                pos1 = value[1]
                temp_seg.append(sent[pos0:pos1])
            seg.append(temp_seg)