def topinyin(s):
    """s consists of Chinese characters."""
    s = util.as_text(s)
    py_list = pypinyin.lazy_pinyin(s)
    result = []
    for py in py_list:
        py = util.as_text(py)
        if py == '〇':
            result.append('ling')
        else:
            result.append(util.simplify_pinyin(py))
    return result
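# Usage sketch (illustration only; assumes pypinyin is installed and that
# util.as_text / util.simplify_pinyin behave as in this repo).
# pypinyin.lazy_pinyin already returns tone-less syllables, so topinyin mainly
# normalizes them:
import pypinyin

print(pypinyin.lazy_pinyin(u'中国'))  # ['zhong', 'guo']
# topinyin(u'中国')  ->  ['zhong', 'guo']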
def read_from_sentence_txt(start, emission, transition):
    ## ./result/sentence.txt
    print('read from sentence.txt')
    for line in open(SENTENCE_FILE):
        line = util.as_text(line.strip())
        if len(line) < 2:
            continue
        if not util.is_chinese(line):
            continue

        ## for start
        start.setdefault(line[0], 0)
        start[line[0]] += 1

        ## for emission
        pinyin_list = topinyin(line)
        char_list = [c for c in line]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1

        ## for transition
        for f, t in zip(line[:-1], line[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += 1
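# Minimal sketch (assumption: the tables above hold plain numeric counts) of how
# such nested count tables can be row-normalized into probabilities before use:
def normalize_counts(table):
    probs = {}
    for key, row in table.items():
        total = float(sum(row.values()))
        probs[key] = {k: v / total for k, v in row.items()}
    return probs

# e.g. normalize_counts({'中': {'国': 3, '心': 1}})  ->  {'中': {'国': 0.75, '心': 0.25}}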
def segment(self, pic, pipe, text, lower=False, use_jieba=False):
    text = util.as_text(text)
    # sentences: the text split on end-of-sentence delimiters
    sentences = self.ss.segment(text)
    words_no_filter = self.ws.segment_sentences(
        pic, pipe,
        sentences=sentences,
        lower=lower,
        use_stop_words=False,
        use_speech_tags_filter=False,
        use_jieba=use_jieba)
    words_no_stop_words = self.ws.segment_sentences(
        pic, pipe,
        sentences=sentences,
        lower=lower,
        use_stop_words=True,
        use_speech_tags_filter=False,
        use_jieba=use_jieba)
    words_all_filters = self.ws.segment_sentences(
        pic, pipe,
        sentences=sentences,
        lower=lower,
        use_stop_words=True,
        use_speech_tags_filter=True,
        use_jieba=use_jieba)
    return util.AttrDict(sentences=sentences,
                         words_no_filter=words_no_filter,
                         words_no_stop_words=words_no_stop_words,
                         words_all_filters=words_all_filters)
def gen_emission():
    """
    base_emission = {} #> {'泥': {'ni':1.0}, '了':{'liao':0.5, 'le':0.5}}
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    for line in open(HZ2PY_FILE):
        line = util.as_text(line.strip())
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip())
            for item in pinyin_list.split(',')
        ]
        char_list = [hanzi] * len(pinyin_list)
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0.)
            emission[hanzi][pinyin] += 1.

    # convert per-hanzi counts into log-probabilities
    for hanzi in emission:
        num_sum = 0.
        for pinyin in emission[hanzi]:
            num_sum += emission[hanzi][pinyin]
        for pinyin in emission[hanzi]:
            emission[hanzi][pinyin] = round(
                math.log(emission[hanzi][pinyin] / num_sum), 6)

    data['default'] = round(math.log(1.e-200), 6)
    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
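# Illustration of the numbers gen_emission() writes (the sample hanzi/pinyin pair
# is an assumption): counts become rounded natural-log probabilities, e.g.
# {'default': -460.517019, 'data': {'了': {'le': -0.693147, 'liao': -0.693147}}}.
import math

print(round(math.log(0.5), 6))      # -0.693147
print(round(math.log(1.e-200), 6))  # -460.517019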
def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment a piece of text into words and return the result as a list.

    Keyword arguments:
    lower                  -- whether to lowercase words (for English)
    use_stop_words         -- if True, filter out words found in the stop-word set
    use_speech_tags_filter -- whether to filter by part of speech; if True, keep only
                              tags in self.default_speech_tag_filter, otherwise keep all
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)

    if use_speech_tags_filter:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]

    # drop special symbols (jieba flag 'x')
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]

    if lower:
        word_list = [word.lower() for word in word_list]

    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

    return word_list
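# Quick look at the pseg.cut() pairs this method filters (assumes the jieba
# package is installed; the sample sentence and tags are illustrative):
import jieba.posseg as pseg

for w in pseg.cut(u'我爱北京天安门'):
    print(w.word, w.flag)  # e.g. 我/r, 爱/v, 北京/ns, 天安门/ns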
def segment(self, text):
    res = [util.as_text(text)]
    for sep in self.delimiters:
        text, res = res, []
        for seq in text:
            res += seq.split(sep)
    res = [s.strip() for s in res if len(s.strip()) > 0]
    return res
def topinyin(s):
    """s consists of Chinese characters."""
    s = util.as_text(s)
    py_list = PinyinHelper.convertToPinyinFromSentence(s)
    result = []
    for py in py_list:
        py = util.as_text(py)
        if py == '〇':
            result.append('ling')
        else:
            result.append(util.simplify_pinyin(py))

    if ',' in ''.join(result):
        print(s)
        print(''.join(result))
        sys.exit()

    return result
def __init__(self, stop_words_file=None, allowed_speech_tags=allowed_speech_tags):
    allowed_speech_tags = [util.as_text(item) for item in allowed_speech_tags]
    self.default_tag_filter = allowed_speech_tags
    self.stop_words = set()
    self.stop_words_file = get_default_stop_words_file()
    if type(stop_words_file) is str:
        self.stop_words_file = stop_words_file
    for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'):
        self.stop_words.add(word.strip())
def segment(self, text):
    res = [util.as_text(text)]  # start with the full text as a single piece
    util.debug(res)
    util.debug(self.delimiters)
    for sep in self.delimiters:
        text, res = res, []
        for seq in text:
            res += seq.split(sep)  # split each piece on the current delimiter
    res = [s.strip() for s in res if len(s.strip()) > 0]
    return res
def process_hanzipinyin(emission):
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    for line in open(HANZI2PINYIN_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        hanzi, pinyins = line.split('=')
        pinyins = pinyins.split(',')
        pinyins = [util.simplify_pinyin(py) for py in pinyins]
        for pinyin in pinyins:
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1
def gen_py2hz():
    data = {}
    for line in open(PY2HZ_FILE):
        line = util.as_text(line.strip())
        ls = line.split('=')
        if len(ls) != 2:
            raise Exception('invalid format')
        py, chars = ls
        py = py.strip()
        chars = chars.strip()
        if len(py) > 0 and len(chars) > 0:
            data[py] = chars
    writejson2file(data, FIN_PY2HZ_FILE)
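# Expected PY2HZ_FILE line format (the sample line is an assumption for
# illustration): one pinyin syllable, '=', then every hanzi read as that syllable.
sample = u'zhong=中种重众'
py, chars = sample.split('=')
print({py.strip(): chars.strip()})  # {'zhong': '中种重众'}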
def extract_chinese_sentences(content):
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    s = ''
    for c in content:
        if util.is_chinese(c):
            s += c
        else:
            sentences.append(s)
            s = ''
    sentences.append(s)
    return [s.strip() for s in sentences if len(s.strip()) > 1]
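# Self-contained sketch of the same sentence-extraction idea, with a local
# is_chinese check standing in for util.is_chinese (an assumption):
def _is_chinese(c):
    return u'\u4e00' <= c <= u'\u9fff'

content = u'今天天气不错, nice weather, 我们出去走走吧!'
s, pieces = '', []
for c in content:
    if _is_chinese(c):
        s += c
    else:
        pieces.append(s)
        s = ''
pieces.append(s)
print([p for p in pieces if len(p.strip()) > 1])  # ['今天天气不错', '我们出去走走吧']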
def __init__(self, stop_words_file=None, allow_speech_tags=util.allow_speech_tags):
    """
    Keyword arguments:
    stop_words_file   -- path to a UTF-8 file with one stop word per line;
                         if not a str, the default stop-word file is used
    allow_speech_tags -- list of part-of-speech tags used for filtering
    """
    allow_speech_tags = [util.as_text(item) for item in allow_speech_tags]
    self.default_speech_tag_filter = allow_speech_tags
    self.stop_words = set()
    self.stop_words_file = get_default_stop_words_file()
    if type(stop_words_file) is str:
        self.stop_words_file = stop_words_file
    for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'):
        self.stop_words.add(word.strip())
def keywords(self, text, n):
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = utils.as_text(text)
    tokens = utils.cut_sentences(text)
    sentences, sents = utils.psegcut_filter_words(tokens,
                                                  self.__stop_words,
                                                  self.__use_stopword)

    word_index, index_word, words_number = self.build_vocab(sents)
    graph = self.create_graph(sents, words_number, word_index,
                              window=self.__window)
    scores = utils.weight_map_rank(graph, max_iter=self.__max_iter,
                                   tol=self.__tol)
    sent_selected = nlargest(n, zip(scores, count()))
    sent_index = []
    for i in range(n):
        sent_index.append(sent_selected[i][1])
    return [index_word[i] for i in sent_index]
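# Minimal sketch of the nlargest(n, zip(scores, count())) idiom used above:
# it yields the top-n (score, index) pairs, so element [1] of each pair is the
# index of the word/sentence in its original order.
from heapq import nlargest
from itertools import count

scores = [0.2, 0.9, 0.5]
top = nlargest(2, zip(scores, count()))
print(top)                  # [(0.9, 1), (0.5, 2)]
print([i for _, i in top])  # [1, 2]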
def read_from_word_txt(start, emission, transition):
    ## ! refinement of the counts based on word.txt
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    for line in open(WORD_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        if len(line) < 3:
            continue
        ls = line.split('=')
        if len(ls) != 2:
            continue
        word, num = ls
        word = word.strip()
        num = num.strip()
        if len(num) == 0:
            continue
        num = float(num)
        num = max(_min_value, num / _base)
        if not util.is_chinese(word):
            continue

        ## for start
        start.setdefault(word[0], 0)
        start[word[0]] += num

        ## for emission
        pinyin_list = topinyin(word)
        char_list = [c for c in word]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += num

        ## for transition
        for f, t in zip(word[:-1], word[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += num
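# Hypothetical driver (assumption: SENTENCE_FILE and WORD_FILE exist and the
# helpers above are in scope) showing how both readers accumulate into the
# same three HMM tables:
if __name__ == '__main__':
    start, emission, transition = {}, {}, {}
    read_from_sentence_txt(start, emission, transition)
    read_from_word_txt(start, emission, transition)
    print(len(start), len(emission), len(transition))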
def get_word_list(self, text, lower=True, strip_stop_words=True, use_tag_filter=False):
    text = util.as_text(text)
    jieba_result = pseg.cut(text)

    if use_tag_filter:
        jieba_result = [w for w in jieba_result if w.flag in self.default_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]

    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]

    if lower:
        word_list = [word.lower() for word in word_list]

    if strip_stop_words:
        word_list = [word.strip() for word in word_list
                     if word.strip() not in self.stop_words]

    return word_list
def segment(self, text, lower=False):
    text = util.as_text(text)
    sentences = self.ss.segment(text)
    words_no_filter = self.ws.segment_sentences(sentences=sentences,
                                                lower=lower,
                                                use_stop_words=False,
                                                use_speech_tags_filter=False)
    words_no_stop_words = self.ws.segment_sentences(sentences=sentences,
                                                    lower=lower,
                                                    use_stop_words=True,
                                                    use_speech_tags_filter=False)
    words_all_filters = self.ws.segment_sentences(sentences=sentences,
                                                  lower=lower,
                                                  use_stop_words=True,
                                                  use_speech_tags_filter=True)
    return util.AttrDict(
        sentences=sentences,
        words_no_filter=words_no_filter,
        words_no_stop_words=words_no_stop_words,
        words_all_filters=words_all_filters
    )
def summarize(self, text):
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = util.as_text(text)  # normalize encoding
    tokens = util.cut_sentences(text)
    # sentences keeps the original sentences; sents is used for the computations
    sentences, sents = util.cut_filter_words(tokens, self.__stop_words, self.__use_stopword)
    if self.__use_w2v:
        sents = self.filter_dictword(sents)
    graph = self.create_graph_sentence(sents, self.__use_w2v)
    scores = util.weight_map_rank(graph, self.__max_iter, self.__tol)
    num = len(scores)
    n = 1 if num > 1 else num
    sent_selected = nlargest(n, zip(scores, count()))
    sent_index = []
    for i in range(n):
        sent_index.append(sent_selected[i][1])  # index of the sentence in the original text
    return [sentences[i] for i in sent_index if sentences[i] != '']
import sys
import codecs


def segment(self, text):
    res = [util.as_text(text)]  # start with the full text as a single piece
    util.debug(res)
    util.debug(self.delimiters)
    for sep in self.delimiters:
        text, res = res, []
        for seq in text:
            res += seq.split(sep)  # split each piece on the current delimiter
    res = [s.strip() for s in res if len(s.strip()) > 0]
    return res


text = codecs.open('../test/doc/01.txt', 'r', 'utf-8').read()
delimiters = set([util.as_text(item) for item in util.sentence_delimiters])
res = [util.as_text(text)]
for sep in delimiters:
    text, res = res, []
    for seq in text:  # the input text already contains newlines, so it is split on those as well
        res += seq.split(sep)  # split each piece on the current delimiter
# print(res)
res = [s.strip() for s in res if len(s.strip()) > 0]  # strip whitespace and drop empty pieces to get the final sentence list
def segment(self, pic, pipe, text, lower=True, use_stop_words=True,
            use_speech_tags_filter=False, use_jieba=False):
    """Segment a piece of text into words and return the result as a list.

    Keyword arguments:
    lower                  -- whether to lowercase words (for English)
    use_stop_words         -- if True, filter out words found in the stop-word set
    use_speech_tags_filter -- whether to filter by part of speech; if True, keep only
                              tags in self.default_speech_tag_filter, otherwise keep all
    """
    text = util.as_text(text).split(",")
    if len(text) > 0:
        text = text[0]

    if use_jieba:
        jieba_result = pseg.cut(text)
        if use_speech_tags_filter:
            jieba_result = [w for w in jieba_result
                            if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]
        # drop special symbols (jieba flag 'x')
        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]
    else:
        word2id_c, id2tag_c, word2id_p, id2tag_p, word2id_n, id2tag_n, zy = pic
        cws, pos = pipe.analyze(text, word2id_c, id2tag_c, zy,
                                word2id_p, id2tag_p, word2id_n, id2tag_n)
        pos = [pos[i][1] for i in range(len(pos))]
        if use_speech_tags_filter:
            cws = [cws[i] for i in range(len(pos))
                   if pos[i] in self.default_speech_tag_filter]
            pos = [pos[i] for i in range(len(pos))
                   if pos[i] in self.default_speech_tag_filter]
        word_list = [cws[i].strip() for i in range(len(pos)) if pos[i] != 'x']
        word_list = [word for word in word_list if len(word) > 0]

    if lower:
        word_list = [word.lower() for word in word_list]

    if use_stop_words:
        word_list = [word.strip() for word in word_list
                     if word.strip() not in self.stop_words]

    return word_list
import sys

try:
    sys.setdefaultencoding('utf-8')
except:
    pass

SOURCE_FILE = '../data/train/original/hanzipinyin.txt'
ALL_STATES_FILE = '../data/train/result/all_states.txt'              # hanzi (hidden states)
ALL_OBSERVATIONS_FILE = '../data/train/result/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = '../data/train/result/pinyin2hanzi.txt'

states = set()
observations = set()
py2hz = {}

for line in open(SOURCE_FILE):
    line = util.as_text(line.strip())
    hanzi, pinyin_list = line.split('=')
    pinyin_list = [
        util.simplify_pinyin(item.strip())
        for item in pinyin_list.split(',')
    ]
    states.add(hanzi)
    for pinyin in pinyin_list:
        observations.add(pinyin)
        py2hz.setdefault(pinyin, set())
        py2hz[pinyin].add(hanzi)
        # initial consonant (shengmu)
        shengmu = util.get_shengmu(pinyin)
        if shengmu is not None:
            py2hz.setdefault(shengmu, set())
def __init__(self, delimiters=util.sentence_delimiters):
    """
    Keyword arguments:
    delimiters -- iterable of delimiters used to split sentences
    """
    self.delimiters = set([util.as_text(item) for item in delimiters])