コード例 #1
0
ファイル: preprocess.py プロジェクト: tianwengang/open-gram
def split_into_sentences(line):
    """Split a line of mixed Chinese/Latin text into tokenised sentences.

    The line is scanned character by character. Characters accepted by
    ``is_punct`` or ``is_zh`` each become a token of their own; any other
    non-whitespace characters are buffered and emitted as one joined token
    when the run ends. A character accepted by ``is_terminator`` closes the
    current sentence. ``is_terminator``, ``is_punct`` and ``is_zh`` are
    helpers defined elsewhere in the project.

    Args:
        line: the text to split (iterated one character at a time).

    Yields:
        A list of string tokens for each sentence. A sentence closed by a
        terminator includes that terminator as its last token; terminators
        that would produce an otherwise empty sentence are skipped.
    """
    tokens = []
    en_token = []

    def close_token(token):
        # Emit the buffered characters as a single token and reset the buffer.
        if token:
            tokens.append(''.join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Flush the pending word *before* testing for an empty sentence;
            # otherwise a sentence made only of a buffered word (e.g. "abc。")
            # is skipped and the word leaks into the following sentence.
            close_token(en_token)
            if not tokens:
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c) or is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c.isspace():
            # Any whitespace (including a trailing newline) separates words
            # and is never part of a token.
            close_token(en_token)
        else:
            en_token.append(c)
    # Flush the last buffered word so it is not lost at end of input.
    close_token(en_token)
    if tokens:
        yield tokens
コード例 #2
0
def split_into_sentences(line):
    """Yield the sentences of ``line`` as lists of tokens.

    Each character is classified with the project helpers ``is_terminator``,
    ``is_punct`` and ``is_zh`` (defined outside this block):

    * a terminator ends the current sentence and is kept as its last token;
    * a punctuation or Chinese character is a token on its own;
    * whitespace separates words;
    * every other character is accumulated into a multi-character word token.

    Args:
        line: the text to split (iterated one character at a time).

    Yields:
        One list of string tokens per non-empty sentence.
    """
    tokens = []
    en_token = []

    def close_token(token):
        # Turn the accumulated characters into one token, then clear them.
        if token:
            tokens.append("".join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # The buffered word belongs to the sentence being closed, so it
            # must be flushed before deciding whether the sentence is empty.
            close_token(en_token)
            if not tokens:
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c) or is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c.isspace():
            # Treat all whitespace, not only space and tab, as a separator so
            # that line endings never end up inside a token.
            close_token(en_token)
        else:
            en_token.append(c)
    # A word still buffered at end of input is part of the final sentence.
    close_token(en_token)
    if tokens:
        yield tokens
コード例 #3
0
ファイル: segment.py プロジェクト: zxlmufc/chinese_nlp
def st_trainMatrix(trainfile):
    """Accumulate B/M/E/S statistics from a segmented corpus and normalise them.

    Every line of ``trainfile`` is split on whitespace into words. Each
    character is labelled ``S`` (single-character word), ``B`` (first
    character), ``M`` (inner character) or ``E`` (last character). Label
    transitions are counted in the module-level ``count_trans`` and
    (label, character) pairs in ``count_mixed``; the normalised results are
    written to ``P_transMatrix`` and ``P_mixedMatrix``. These tables, together
    with ``enumo`` and ``hanzi_util``, are defined outside this block.

    Args:
        trainfile: path of the whitespace-segmented training text.

    Returns:
        None. The results are stored in the module-level tables.
    """
    with open(trainfile) as fin:
        for line in fin:
            # Build a filtered list instead of calling remove() on the list
            # being iterated: removing during iteration skips the element that
            # follows each removed one, so adjacent terminators/punctuation
            # survived the filter.
            words = [
                item for item in line.split()
                if not (hanzi_util.is_terminator(item)
                        or (len(item) == 1 and hanzi_util.is_punct(item)))
            ]
            # Nothing left to learn from on this line.
            if not words:
                continue

            # BEMS encode: line_hits[i] is the label of the i-th character.
            line_hits = []
            for word in words:
                if len(word) == 1:
                    line_hits.append('S')
                else:
                    line_hits.append('B')
                    line_hits.extend('M' * (len(word) - 2))
                    line_hits.append('E')

            chars = ''.join(words)
            # Sanity check: exactly one label per character.
            if len(chars) != len(line_hits):
                print("EEEEEEE %d<->%d" % (len(chars), len(line_hits)))

            # Transition counts, used for P[I][J].
            for prev_state, next_state in zip(line_hits, line_hits[1:]):
                count_trans[prev_state][next_state] += 1

            # Emission counts. Every character is counted, including the last
            # one of the line (the previous loop bound of len - 1 dropped it).
            for state, char in zip(line_hits, chars):
                seen = count_mixed[state]
                seen[char] = seen.get(char, 0) + 1

    for k_i, v_i in count_trans.items():
        count = sum(v_i.values())
        for k_j, v_j in v_i.items():
            P_transMatrix[k_i][k_j] = v_j / count

    # Give characters never observed under a state a count of 1.
    for v_i in count_mixed.values():
        for item in enumo:
            v_i.setdefault(item, 1)

    # NOTE(review): 1 is added to each count for smoothing while the
    # denominator is the unsmoothed total, so a row does not sum to 1.
    # Kept as in the original; confirm whether that is intended.
    for k_i, v_i in count_mixed.items():
        count = sum(v_i.values())
        for k_j, v_j in v_i.items():
            P_mixedMatrix[k_i][k_j] = (v_j + 1) / count

    return
コード例 #4
0
ファイル: disamb_v2.py プロジェクト: zxlmufc/chinese_nlp
def split_to_sentnces(lst):
    """Split a sequence into sentences at its terminator elements.

    Elements accepted by ``is_terminator`` (a helper defined elsewhere in the
    project) act as separators and are not included in the result.

    Args:
        lst: a sliceable sequence, e.g. a list of tokens or a string.

    Returns:
        A list of slices of ``lst``, one per sentence. Consecutive
        terminators produce empty slices. Elements after the last terminator
        are returned as a final sentence.
    """
    ret = []
    fro = 0
    for i, item in enumerate(lst):
        if is_terminator(item):
            ret.append(lst[fro:i])
            fro = i + 1
    # Keep the trailing sentence when the input does not end with a
    # terminator; previously those elements were silently dropped.
    if fro < len(lst):
        ret.append(lst[fro:])
    return ret
コード例 #5
0
ファイル: disamb_v2.py プロジェクト: Sandy4321/chinese_nlp
def split_to_sentnces(lst):
    """Cut ``lst`` into sentence slices, using terminators as separators.

    ``is_terminator`` is a helper defined outside this block; the elements it
    accepts mark the end of a sentence and are left out of the output.

    Args:
        lst: a sliceable sequence, e.g. a list of tokens or a string.

    Returns:
        A list with one slice of ``lst`` per sentence. A terminator that
        directly follows another yields an empty slice. Any elements left
        after the final terminator form the last sentence.
    """
    ret = []
    fro = 0
    for i, item in enumerate(lst):
        if is_terminator(item):
            ret.append(lst[fro:i])
            fro = i + 1
    # Input that does not end with a terminator still has a last sentence;
    # without this step it was discarded.
    if fro < len(lst):
        ret.append(lst[fro:])
    return ret