def split_into_sentences(line):
    """Split *line* into sentences, yielding each sentence as a token list.

    CJK characters and punctuation become single-char tokens; runs of any
    other non-space characters (e.g. Latin words/digits) accumulate into one
    token. A sentence ends at a terminator character, which is appended as
    the sentence's final token before the sentence is yielded.

    Fixes vs. the original:
    - a pending word is flushed BEFORE the empty-sentence check, so input
      like "abc." no longer silently loses "abc";
    - a trailing word with no following separator is flushed at end of
      input, so the final (unterminated) sentence is still yielded.
    """
    tokens = []
    en_token = []  # accumulator for a run of non-CJK, non-punct chars

    def close_token(token):
        # Flush the accumulated multi-char token into `tokens`, if any.
        if token:
            tokens.append(''.join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Flush any pending word before deciding the sentence is empty,
            # otherwise a trailing word such as "abc." would be discarded.
            close_token(en_token)
            if not tokens:
                # Skip bare/consecutive terminators with no content.
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            tokens.append(c)
        elif is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            # Whitespace only terminates the current word token.
            close_token(en_token)
        else:
            en_token.append(c)
    # Flush a trailing word, then yield the final unterminated sentence.
    close_token(en_token)
    if tokens:
        yield tokens
def split_into_sentences(line):
    """Split *line* into sentences, yielding each sentence as a token list.

    CJK characters and punctuation become single-char tokens; runs of any
    other non-space characters (e.g. Latin words/digits) accumulate into one
    token. A sentence ends at a terminator character, which is appended as
    the sentence's final token before the sentence is yielded.

    Fixes vs. the original:
    - a pending word is flushed BEFORE the empty-sentence check, so input
      like "abc." no longer silently loses "abc";
    - a trailing word with no following separator is flushed at end of
      input, so the final (unterminated) sentence is still yielded.

    NOTE(review): this duplicates an earlier definition of the same name
    in this file; this later one is the binding that survives at runtime.
    """
    tokens = []
    en_token = []  # accumulator for a run of non-CJK, non-punct chars

    def close_token(token):
        # Flush the accumulated multi-char token into `tokens`, if any.
        if token:
            tokens.append("".join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Flush any pending word before deciding the sentence is empty,
            # otherwise a trailing word such as "abc." would be discarded.
            close_token(en_token)
            if not tokens:
                # Skip bare/consecutive terminators with no content.
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            tokens.append(c)
        elif is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u" " or c == u"\t":
            # Whitespace only terminates the current word token.
            close_token(en_token)
        else:
            en_token.append(c)
    # Flush a trailing word, then yield the final unterminated sentence.
    close_token(en_token)
    if tokens:
        yield tokens
def st_trainMatrix(trainfile):
    """Train the HMM transition/emission matrices from a segmented corpus.

    Each line of *trainfile* contains whitespace-separated words. Every
    word is BEMS-encoded (B/M/E for multi-char words, S for single chars);
    the module-level tallies `count_trans` / `count_mixed` are updated and
    then normalized into `P_transMatrix` / `P_mixedMatrix`.

    Fixes vs. the original:
    - punctuation/terminator tokens are dropped by building a NEW list
      (the original called list.remove() while iterating the same list,
      which silently skips the element following each removed one);
    - the emission-count loop now covers every character (the original
      used range(len(line_hits) - 1), a copy-paste from the transition
      loop, dropping the last char of every line).
    """
    with open(trainfile) as fin:
        for line in fin:
            # Drop terminators and single-char punctuation tokens.
            line_items = [
                item for item in line.strip().split()
                if not (hanzi_util.is_terminator(item)
                        or (len(item) == 1 and hanzi_util.is_punct(item)))
            ]
            if not line_items:
                continue
            # BEMS-encode: exactly one status char per corpus char.
            line_hits = []
            for word in line_items:
                if len(word) == 1:
                    line_hits += 'S'
                else:
                    for j_index in range(len(word)):
                        if j_index == 0:
                            line_hits += 'B'
                        elif j_index == len(word) - 1:
                            line_hits += 'E'
                        else:
                            line_hits += 'M'
            # Sanity check: char count must match status count.
            if len(''.join(line_items)) != len(line_hits):
                print("EEEEEEE %d<->%d" % (len(''.join(line_items)), len(line_hits)))
            line_items = ''.join(line_items)
            for i in range(len(line_hits) - 1):
                # Transition counts P[I][J] over adjacent status pairs.
                count_trans[line_hits[i]][line_hits[i + 1]] += 1
            for i in range(len(line_hits)):
                # Emission counts: char frequency per status, ALL chars.
                if line_items[i] not in count_mixed[line_hits[i]]:
                    count_mixed[line_hits[i]][line_items[i]] = 1
                else:
                    count_mixed[line_hits[i]][line_items[i]] += 1
    # Normalize transition counts into probabilities.
    for (k_i, v_i) in count_trans.items():
        count = sum(v_i.values())
        for (k_j, v_j) in v_i.items():
            P_transMatrix[k_i][k_j] = v_j / count
    # Unseen chars get a pseudo-count of 1 (smoothing).
    for (k_i, v_i) in count_mixed.items():
        for item in enumo:
            if item not in v_i:
                count_mixed[k_i][item] = 1
    # Add-one smoothing on top of the counts when normalizing emissions.
    for (k_i, v_i) in count_mixed.items():
        count = sum(v_i.values())
        for (k_j, v_j) in v_i.items():
            P_mixedMatrix[k_i][k_j] = (v_j + 1) / count
    return
def prep_word_dict():
    """Build the term dictionary from the lexicon file and the corpus.

    Pass 1: scan IN_FILE for '[DDv...]' headword entries and their
    '【...=】' expansion lines, registering each term via term_to_id().
    Pass 2: segment YL_FILE with pynlpir, register Chinese terms, replace
    single-char punctuation with PUNCING and everything else with PADDING,
    and write the normalized lines to YLP_FILE.

    Fix vs. the original: LINE_NUM is a function local that was first
    assigned only AFTER pass 1, yet pass 1's except handler prints it —
    a read error there raised UnboundLocalError instead of the intended
    message. It is now initialized up front.
    """
    LINE_NUM = 0
    CURRENT_W = None
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except:
                # NOTE(review): bare except + continue can spin forever if
                # readline() keeps failing at the same file position —
                # consider narrowing and breaking instead; kept as-is.
                print("READ ERROR:%d" %(LINE_NUM) )
                continue
            if not line:
                print("PROCESS DONE!")
                break
            if line[:4] == '[DDv':
                # '[DDv...]' header line: headword sits between col 5
                # and the closing ']'.
                CURRENT_W = line[5: line.index(']')]
                term_to_id(CURRENT_W)
                continue
            if CURRENT_W and line[0] == '【' and ('=】' in line):
                # Expansion line: register every term after the '】'.
                line_x = line[line.index('】')+1:].split()
                for item in line_x:
                    term_to_id(item)
                continue
    LINE_NUM = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" %(LINE_NUM) )
                continue
            if not line:
                print("PROCESS DONE!")
                break
            LINE_NUM += 1
            if not (LINE_NUM % 5000):
                # Progress heartbeat every 5000 lines.
                print('C:%d' %(LINE_NUM))
            if len(line) > 30:
                # Skip overlong lines.
                continue
            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i in range(len(seg_list)):
                if is_zhs(seg_list[i]):
                    term_to_id(seg_list[i])
                elif len(seg_list[i]) == 1 and is_punct(seg_list[i]):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')
    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')
def prep_word_dict():
    """Build the term dictionary from the lexicon file and the corpus.

    Pass 1: scan IN_FILE for '[DDv...]' headword entries and their
    '【...=】' expansion lines, registering each term via term_to_id().
    Pass 2: segment YL_FILE with pynlpir, register Chinese terms, replace
    single-char punctuation with PUNCING and everything else with PADDING,
    and write the normalized lines to YLP_FILE.

    Fix vs. the original: LINE_NUM is a function local that was first
    assigned only AFTER pass 1, yet pass 1's except handler prints it —
    a read error there raised UnboundLocalError instead of the intended
    message. It is now initialized up front.

    NOTE(review): this duplicates an earlier definition of the same name
    in this file; this later one is the binding that survives at runtime.
    """
    LINE_NUM = 0
    CURRENT_W = None
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except:
                # NOTE(review): bare except + continue can spin forever if
                # readline() keeps failing at the same file position —
                # consider narrowing and breaking instead; kept as-is.
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            if line[:4] == '[DDv':
                # '[DDv...]' header line: headword sits between col 5
                # and the closing ']'.
                CURRENT_W = line[5:line.index(']')]
                term_to_id(CURRENT_W)
                continue
            if CURRENT_W and line[0] == '【' and ('=】' in line):
                # Expansion line: register every term after the '】'.
                line_x = line[line.index('】') + 1:].split()
                for item in line_x:
                    term_to_id(item)
                continue
    LINE_NUM = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            LINE_NUM += 1
            if not (LINE_NUM % 5000):
                # Progress heartbeat every 5000 lines.
                print('C:%d' % (LINE_NUM))
            if len(line) > 30:
                # Skip overlong lines.
                continue
            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i in range(len(seg_list)):
                if is_zhs(seg_list[i]):
                    term_to_id(seg_list[i])
                elif len(seg_list[i]) == 1 and is_punct(seg_list[i]):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')
    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')