def test_json():
    pt.initialize()
    res = [pt.lcut(s) for s in sentences]
    j = json.dumps(res, cls=PairJSONEncoder)
    print(j)
    k = json.loads(j, cls=PairJSONDecoder)
    print(k)
def __init__(self, model='jieba'):
    self.model = model
    if model.lower() == 'jieba':
        import jieba.posseg as posseg
        posseg.initialize()
        self.segmentor = posseg.POSTokenizer(tokenizer=None)
    elif model.lower() == 'ictclas':
        import pynlpir
        pynlpir.open()
        self.segmentor = pynlpir
    else:
        raise NotImplementedError
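# Illustrative usage sketch (not part of the original snippet): assuming the __init__
# above belongs to a small wrapper class, hypothetically named Segmentor here, the two
# backends would be driven roughly like this.
# seg = Segmentor(model='jieba')
# print(list(seg.segmentor.cut('我爱北京天安门')))                    # jieba POSTokenizer yields (word, flag) pairs
# seg = Segmentor(model='ictclas')
# print(seg.segmentor.segment('我爱北京天安门', pos_tagging=True))    # pynlpir.segment returns (word, pos) tuples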
import logging

# jieba.load_userdict('/home/gyzhang/projects/cFrontEnd/data/dicts/dict_name.dict')
# from jieba import posseg
import time
# import pdb
import json, uuid, http.client, urllib.parse
import pycantonese as pc
from utils import *
import jyutping
from collections import OrderedDict
from linguistic_dict import Linguistic_DICT
from jieba import posseg
import tensorflow as tf
from aip import AipNlp
from pypinyin import pinyin, Style, style
# from hanziconv import HanziConv

posseg.initialize(dictionary='../data/dicts/simple_dict.txt')
ld = Linguistic_DICT()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


# Configuration parameters for the front end
class FrontEnd(object):
    def __init__(self, ):
        # project and dictionary file paths
        self.project_path = '/home/gyzhang/projects/cFrontEnd'
        self.wav_folder = '/home/gyzhang/speech_database/cuprosody/Wave'
        self.name = "cuprosody"
# @Time    : 18-9-28 1:47 PM
# @Author  : duyongan
# @FileName: text_utils.py
# @Software: PyCharm
import re
from simple_pickle import utils
from text_process.text import Text
import nltk
import os
import numpy as np
from jieba import posseg
# from cppjieba_py import posseg
# from numba import jit

posseg.initialize()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
here = os.path.dirname(__file__)
stopwords = utils.read_pickle(here + '/stopwords')
idf_map = utils.read_pickle(here + '/idf_map')


def text2sencents_zh(text):
    text = re.sub('\u3000|\r|\t|\xa0', '', text)
    # drop sentence-final punctuation before a closing quote so the split below
    # does not end a sentence inside quoted speech
    text = re.sub('?”|!”|。”', '”', text)
    sentences = re.split("([。!?……])", text)
    sentences.append('')
    sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])]
    last_sentences = []
    for sentence in sentences:
        last_sentences += [
# combine similarity scores
_similarity_smooth = lambda x, y, z, u: (x * y) + z - u
_flat_sum_array = lambda x: np.sum(x, axis=0)  # numerator

'''
tokenizer settings
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
        tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
        print("info: set wordseg dict with %s" % tokenizer_dict)
    else:
        print("warning: can not find dict at [%s]" % ENVIRON["SYNONYMS_WORDSEG_DICT"])

print(">> Synonyms load wordseg dict [%s] ... " % tokenizer_dict)
_tokenizer.initialize(tokenizer_dict)

# stopwords
_fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')


def _load_stopwords(file_path):
    '''
    load stop words
    '''
    global _stopwords
    if sys.version_info[0] < 3:
        words = open(file_path, 'r')
    else:
        words = open(file_path, 'r', encoding='utf-8')
    stopwords = words.readlines()
""" import jieba from jieba.kvdict import Kvdict import sys if __name__ == "__main__": freq_dict = Kvdict("word_freq.db") tag_dict = Kvdict("word_tag.db") jieba.dt.initialize() freq_dict.convert_value = lambda x: x if x is None else int(x) jieba.dt.FREQ = freq_dict from jieba import posseg as pg pg.initialize() pg.dt.word_tag_tab = tag_dict # import pdb # pdb.set_trace() pg.dt.add_word("上海电力股份有限公司",1000000,"n") for line in sys.stdin: line = line.strip() print line,jieba.dt.FREQ[line], for x, y in pg.cut(line.strip()): print (u"(%s,%s)" % (x, y)).encode("utf8"), print freq_dict.close() tag_dict.close()
import jieba.posseg as pseg

pseg.initialize()


def multilingual_sent_split(texts):
    print('\nOriginal texts: ', texts)
    lingual_split_sign = {'x', 'eng'}
    final_parts = []
    sub_part = []
    cuts = pseg.lcut(texts)
    for idx in range(len(cuts) - 1):
        # If the current word and the next word fall on the same side of the language
        # split (both tagged in lingual_split_sign, or both tagged outside it), append
        # the current word to the current sub_part
        if (cuts[idx].flag in lingual_split_sign and cuts[idx + 1].flag in lingual_split_sign) or (
                cuts[idx].flag not in lingual_split_sign and cuts[idx + 1].flag not in lingual_split_sign):
            sub_part.append(cuts[idx].word)
        # Otherwise append the current word, push the finished sub_part onto final_parts,
        # and start a new sub_part
        else:
            sub_part.append(cuts[idx].word)
            final_parts.append(sub_part)
            sub_part = []
    # If the last word shares a side with the second-to-last word, append it to the
    # current sub_part
    if (cuts[-1].flag in lingual_split_sign and cuts[-2].flag in lingual_split_sign) or (
            cuts[-1].flag not in lingual_split_sign and cuts[-2].flag not in lingual_split_sign):
        sub_part.append(cuts[-1].word)
    # Otherwise the last word becomes its own sub_part in final_parts
    else:
        final_parts.append([cuts[-1].word])
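# Illustrative call (not part of the original snippet): jieba tags English tokens as
# 'eng' and symbols/whitespace as 'x', so the loop above groups consecutive
# same-language tokens into one sub_part for a mixed Chinese/English input.
# multilingual_sent_split('今天我们学习Python and machine learning的基础知识')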
import logging
import os
import re
import jieba
# jieba.initialize()  # (optional)
# jieba.load_userdict('/home/gyzhang/projects/cFrontEnd/data/dicts/dict_name.dict')
# from jieba import posseg
import time
# import pdb
import json, uuid, http.client, urllib.parse
import pycantonese as pc
import jyutping
from collections import OrderedDict
from linguistic_dict import Linguistic_DICT
from jieba import posseg
import tensorflow as tf

posseg.initialize(dictionary='../data/dicts/dict_name.dict')
ld = Linguistic_DICT()
logging.basicConfig(level=logging.INFO)


# Configuration parameters for the front end
class CFrontEnd(object):
    def __init__(self, ):
        # project and dictionary file paths
        self.project_path = '/home/gyzhang/projects/cFrontEnd'
        self.wav_folder = '/home/gyzhang/speech_database/cuprosody/Wave'
        self.name = "cuprosody"
        self.text_file = os.path.join(self.project_path, "exp", self.name, 'train/cn_text.txt')
        # this is a test file outside this domain
        self.test_text_file = os.path.join(self.project_path, "exp", self.name, "train/cn_text_test.txt")
        self.mld = Linguistic_DICT()
def test_textrank():
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后," + \
        "吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产" + \
        "开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业" + \
        "收入0万元,实现净利润-139.13万元。"
    import jieba.analyse
    res = jieba.analyse.textrank(s, topK=20, withWeight=True, withFlag=True)
    # print(res)
    a = ['公司', '全资', '子公司', '吉林', '欧亚', '置业', '有限公司', '增资', '注册资本', '增加',
         '经营范围', '开发', '百货', '零售', '业务', '在建', '城市', '商业', '综合体', '项目',
         '实现', '营业', '收入', '净利润']
    b = ["实现", "零售", "注册资本", "营业", "置业", "城市", "业务", "欧亚", "开发", "百货",
         "增资", "收入", "子公司", "吉林", "项目", "全资", "商业", "经营范围", "综合体", "在建",
         "公司", "净利润", "有限公司"]
    print(set(a) - set(b))
    print(set(b) - set(a))


if __name__ == '__main__':
    pt.initialize()
    res = pt.lcut(sentences[10])
    res = pt.lcut(sentences[10], HMM=False)
    # print(res)
    # task_test_sentences_cut()
    # task_test_sentences_cut_noHMM()
    # task_test_book_cut()
    # task_test_book_cut_noHMM()
    test_textrank()