def split_into_sentences(line):
    """Split *line* into sentences, yielding each sentence as a token list.

    Tokens are: runs of consecutive non-space, non-punctuation,
    non-Chinese characters (e.g. English words), individual punctuation
    marks, and individual Chinese characters.  A sentence ends at each
    terminator character, with the terminator included as the last
    token.  Leading terminators (empty sentences) are skipped, and any
    trailing tokens without a terminator are yielded as a final,
    unterminated sentence.
    """
    tokens = []
    en_token = []

    def close_token(token):
        # Flush the pending multi-character (e.g. English) token, if any.
        if token:
            tokens.append(u''.join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Flush the pending word FIRST: the original checked `tokens`
            # before flushing, so input like "abc." lost both the word
            # and its terminator.
            close_token(en_token)
            if not tokens:
                # Terminator with no sentence content yet — skip it.
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c) or is_zh(c):
            # Punctuation and Chinese characters are one-char tokens.
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            # Whitespace only terminates the pending word.
            close_token(en_token)
        else:
            en_token.append(c)
    # Flush a trailing word (the original dropped it) and yield the
    # final unterminated sentence, if any.
    close_token(en_token)
    if tokens:
        yield tokens
def split_into_sentences(line):
    """Split *line* into sentences, yielding each sentence as a token list.

    NOTE(review): this is a duplicate definition of split_into_sentences
    (it shadows the earlier one) — consider removing one copy.

    A token is either a run of consecutive "plain" characters (not
    whitespace, punctuation, or Chinese), a single punctuation mark, or
    a single Chinese character.  Each terminator character closes a
    sentence and is appended as its last token; trailing tokens without
    a terminator are yielded as a final partial sentence.
    """
    sentence = []
    word = []

    def flush_word():
        # Move the accumulated plain-character word into the sentence.
        if word:
            sentence.append(u"".join(word))
            del word[:]

    for ch in line:
        if is_terminator(ch):
            # Flush before testing emptiness so "abc." is not dropped
            # (the original tested `sentence` first and lost the word).
            flush_word()
            if sentence:
                sentence.append(ch)
                yield sentence
                sentence = []
            # else: a terminator before any content — ignore it.
        elif is_punct(ch) or is_zh(ch):
            flush_word()
            sentence.append(ch)
        elif ch in (u" ", u"\t"):
            flush_word()
        else:
            word.append(ch)
    # Emit whatever remains after the last terminator.
    flush_word()
    if sentence:
        yield sentence
def st_trainMatrix(trainfile):
    """Train HMM matrices (BEMS tagging) from a pre-segmented corpus.

    Reads *trainfile* — one whitespace-separated, pre-segmented sentence
    per line — tags every character with B/E/M/S (Begin/End/Middle of a
    multi-char word, Single for a one-char word), accumulates transition
    counts into the module-level ``count_trans`` and per-state emission
    counts into ``count_mixed``, then normalizes them into
    ``P_transMatrix`` and ``P_mixedMatrix`` (with add-one smoothing over
    the vocabulary ``enumo`` for emissions).  Returns None; all results
    are stored in those module-level tables.
    """
    with open(trainfile) as fin:
        for line in fin:
            # Drop terminators and single-char punctuation.  The
            # original called list.remove() while iterating the same
            # list, which skips the element following each removal, so
            # consecutive punctuation marks survived the filter.
            line_items = [
                item for item in line.strip().split()
                if not (hanzi_util.is_terminator(item)
                        or (len(item) == 1 and hanzi_util.is_punct(item)))
            ]
            if not line_items:
                continue
            # BEMS-encode: one status character per corpus character,
            # aligned with ''.join(line_items).
            line_hits = []
            for word in line_items:
                if len(word) == 1:
                    line_hits.append('S')
                else:
                    line_hits.append('B')
                    line_hits.extend('M' * (len(word) - 2))
                    line_hits.append('E')
            chars = ''.join(line_items)
            if len(chars) != len(line_hits):
                # Alignment sanity check — should never fire.
                print("EEEEEEE %d<->%d" % (len(chars), len(line_hits)))
            # Transition counts: P[state_i][state_j].
            for i in range(len(line_hits) - 1):
                count_trans[line_hits[i]][line_hits[i + 1]] += 1
            # Emission counts per state.  NOTE(review): like the
            # original, this stops at len-1, so the last character of
            # every line is never counted — looks like an off-by-one;
            # confirm before changing, as it alters the trained model.
            for i in range(len(line_hits) - 1):
                state, ch = line_hits[i], chars[i]
                if ch not in count_mixed[state]:
                    count_mixed[state][ch] = 1
                else:
                    count_mixed[state][ch] += 1
    # Normalize transition counts into probabilities.
    for k_i, v_i in count_trans.items():
        total = sum(v_i.values())
        for k_j, v_j in v_i.items():
            P_transMatrix[k_i][k_j] = v_j / total
    # Give characters never seen in a state a pseudo-count of 1.
    for k_i, v_i in count_mixed.items():
        for item in enumo:
            if item not in v_i:
                count_mixed[k_i][item] = 1
    # Add-one smoothing on top of the pseudo-counts, as in the original.
    for k_i, v_i in count_mixed.items():
        total = sum(v_i.values())
        for k_j, v_j in v_i.items():
            P_mixedMatrix[k_i][k_j] = (v_j + 1) / total
    return
def split_to_sentnces(lst):
    """Split token list *lst* into sentences at terminator tokens.

    Terminator tokens themselves are excluded from the returned
    sentences.  Tokens after the last terminator are returned as a
    final partial sentence — the original silently dropped them, which
    is inconsistent with split_into_sentences, which does yield the
    trailing partial sentence.

    Note: the name keeps its original spelling ("sentnces") so existing
    callers are unaffected.
    """
    sentences = []
    start = 0
    for idx, tok in enumerate(lst):
        if is_terminator(tok):
            sentences.append(lst[start:idx])
            start = idx + 1
    if start < len(lst):
        # Keep the unterminated tail instead of discarding it.
        sentences.append(lst[start:])
    return sentences