Example #1
class segment:
    def __init__(self):
        LTP_DATA_DIR = 'resources/ltp_data_v3.4.0/'
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

        from pyltp import Segmentor
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path,
                                         '/path/to/your/lexicon')

    def seg(self, text):
        words = self.segmentor.segment(text)
        return words

    def destroy(self):
        self.segmentor.release()

    def segFile(self, infile, outfile):
        data = codecs.open(infile, 'r', 'utf-8')
        out = codecs.open(outfile, 'w', 'utf-8')
        for line in data:
            fields = line.strip().split('\t')
            out.write(fields[0] + '\t' + '\t'.join(
                [' '.join(self.seg(fields[i]))
                 for i in range(1, len(fields))]) + '\n')
        data.close()
        out.close()
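A minimal usage sketch for the class above; the example sentence and the sample.txt / sample_seg.txt file names are only illustrative, and the lexicon path placeholder in __init__ still has to be filled in:

seg = segment()
print(' '.join(seg.seg('我爱自然语言处理')))  # space-joined tokens of one sentence
seg.segFile('sample.txt', 'sample_seg.txt')   # segment every tab-separated field after the first column
seg.destroy()                                 # release the segmentation model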
Example #2
def namedEntityRecognize(sentence):
    '''
        Run named-entity recognition with the pyltp module.
        Returns: 1) a list of (entity, tag) tuples, 2) a list of entity tags.
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # pack each word with its NE tag into a tuple
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))

    neTagList = '\t'.join(netags).split('\t')

    return namedEntityTagTupleList, neTagList
Example #3
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load the model
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()  # named-entity recognizer
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)  # load the model
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """
        资源释放
        """
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.labeller.release()
Example #4
def run():
    # segmentation + word selection
    cont = open('key/pinglun_filter_all1.txt', 'r', encoding='utf-8')
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load the model plus a user dictionary
    postagger = Postagger()  # initialize the instance
    postagger.load('pos.model')  # load the model
    nwordall = []
    for sentence in cont:
        nword = ['']
        words = segmentor.segment(sentence)  # word segmentation
        # default output:
        # print(' '.join(words))
        postags = postagger.postag(words)  # POS tagging
        for word, tag in zip(words, postags):
            ############# print words with a chosen POS tag
            # print(word + '/' + tag)
            ############ keep only adverbs
            # if tag == 'd':
            ####### filter out single-character words
            # if ((tag == 'n' or tag == 'd' or tag == 'a') and len(word) > 1):
            ############ use word2vec similarity to find adjectives close to the nouns
            # if ((tag == 'a' or tag == 'n') and len(word) > 1):
            if (tag == 'n') and len(word) > 1:
                # print(word + tag)
                nword.append(word)
        nwordall.append(nword)
    # size is the word-vector dimensionality, window the context-window size,
    # min_count drops words below the frequency threshold, workers the number of threads;
    # a high dimensionality can cause problems
    model = models.word2vec.Word2Vec(nwordall, size=10, window=5, min_count=100, workers=80)
    print('#############################################')
    sim = model.most_similar(positive=[u'餐饮'])
    for s in sim:
        print ("word:%s,similar:%s " %(s[0],s[1]))
Example #5
def feature_about():
    # fetch the feature list
    feature_dict = NewsUtil.get_feature()
    # for each feature mention in the news, grab the 5 nearest following words and their attributes
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(cws_model_path,
                                CFETSFX_LEXICON_PATH)  # load the model; the second argument is your external lexicon path
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
Example #6
def init_pyltp(model_dir, dict_file=None):
    '''
    Initialize the main pyltp modules.
    :param  model_dir   path to the model directory
    :param  dict_file   external lexicon for word segmentation
    :return segmentor, postagger, parser, ner
    '''
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    ner = NamedEntityRecognizer()

    cws_model = os.path.join(model_dir, 'cws.model')
    pos_model = os.path.join(model_dir, 'pos.model')
    parser_model = os.path.join(model_dir, 'parser.model')
    ner_model = os.path.join(model_dir, 'ner.model')

    if dict_file:
        segmentor.load_with_lexicon(cws_model, dict_file)
    else:
        segmentor.load(cws_model)
    postagger.load(pos_model)
    ner.load(ner_model)
    parser.load(parser_model)
    return segmentor, postagger, parser, ner
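A short sketch of how the four returned objects are typically chained, assuming model_dir points at an unpacked ltp_data_v3.4.0 directory and the example sentence is made up:

segmentor, postagger, parser, ner = init_pyltp('ltp_data_v3.4.0')
words = segmentor.segment('他叫汤姆去拿外衣。')
postags = postagger.postag(words)
netags = ner.recognize(words, postags)
arcs = parser.parse(words, postags)
print(list(words), list(postags), list(netags))
print([(arc.head, arc.relation) for arc in arcs])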
Example #7
def get_tfidf_feature():
    '''Build the TF-IDF features.
    Each sample is first split into sentences, and every company name in the result is replaced with the short company name from the previous step.
    Returns:
        the TF-IDF matrix for all samples
    '''
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon('e:/ltp_data_v3.4.0/cws.model', '../data/user_dict.txt')  # load the model
    text = []
    for i, row in sample_data.iterrows():
        words = []
        sentence = row['sentence']
        start = 0
        end = 0
        for entity in row['ner']:
            end = entity[0]
            words.extend(segmentor.segment(sentence[start:end]))
            words.append(entity[3])
            start = entity[1] - 1
        if end < len(sentence):
            words.extend(segmentor.segment(sentence[start:len(sentence)]))
            
        text.append(' '.join(words))

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf_feature = transformer.fit_transform(X.toarray())
    
    segmentor.release() 
    
    return tfidf_feature
Example #8
def load_all_model():
    """Return instances for word segmentation, POS tagging, NER and dependency parsing, plus a word2vec model."""
    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # path to the LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model file is `cws.model`
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(cws_model_path, './temp_file/cut_external_dict/cut_external_dict')  # load the model

    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS-tagging model path; the model file is `pos.model`
    postagger = Postagger()  # initialize the instance
    postagger.load_with_lexicon(pos_model_path, './temp_file/pos_external_dict/pos_external_dict')  # load the model

    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # path to the LTP model directory
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(ner_model_path)  # load the model

    LTP_DATA_DIR = 'E:/MYGIT/Project/ltp_data'  # path to the LTP model directory
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency-parsing model path; the model file is `parser.model`
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model

    fname = r"E:/MYGIT/model/wiki_stopwords/wiki_word2vec.kv"
    # model_wv.save(fname)
    model_wv = KeyedVectors.load(fname, mmap='r')
    return [segmentor, postagger, recognizer, parser, model_wv]
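The returned list is meant to be unpacked in a fixed order; a hedged sketch (the input sentence is made up):

segmentor, postagger, recognizer, parser, model_wv = load_all_model()
words = segmentor.segment('今天上海天气晴朗')
postags = postagger.postag(words)
netags = recognizer.recognize(words, postags)
arcs = parser.parse(words, postags)
print(list(words), list(netags))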
Example #9
class ModelLoader:
    __instance = None

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False
        return cls.__instance

    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # customized segmentation; POS tags are adjusted in post-processing
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

        self.sentenceSplitter = SentenceSplitter()
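Because __new__ caches the single instance, constructing ModelLoader repeatedly returns the same object, so the LTP models are loaded only once per process; a small sketch (the example sentence is made up):

loader_a = ModelLoader()
loader_b = ModelLoader()
assert loader_a is loader_b  # singleton: both names point at one set of loaded models
words = loader_a.segmentor.segment('他叫汤姆去拿外衣。')
print(list(loader_a.postagger.postag(words)))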
Example #10
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(LtpTree, self).__init__()
        print("Loading the LTP models... ...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("加载模型完毕。")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
Example #11
def words_split():
    """ 对于句子进行分词

    :return:
    """
    segmentor = Segmentor()
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load_with_lexicon(cws_model_path, '../data/all_word_dict.txt')
    for sentence in sentences:
        words = segmentor.segment(sentence)
        postags = postaggers(words)
        index = 0
        for word, postag in zip(words, postags):
            if postag == 'v':
                relation_words.append(word)
                # print(word)
        all_words.append(words)
    relation_words_file = open('relation_words.txt', 'w+', encoding='utf8')
    for word in relation_words:
        relation_words_file.write(word + '\n')
    relation_words_file.close()
    # write all words from the current scan to a file
    all_words_file = open('all_words.txt', 'w+', encoding='utf8')
    for words in all_words:
        temp_words = '\t'.join(words)
        all_words_file.write(temp_words + '\n')
    all_words_file.close()
    segmentor.release()
Example #12
def cut_words():
    # segmentation + removal of empty lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model',
                                'userdict.txt')  # load the model plus a user dictionary
    postagger = Postagger()  # initialize the instance
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # word segmentation
            pos_tags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    f.close()
    segmentor.release()
    postagger.release()
Example #13
class LTP_word():
    """docstring for parser_word
    deal处理文本,返回词表、词性及依存关系,语义,命名实体五个值
    release释放缓存"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # 分词初始化实例
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger() # 词性标注初始化实例
        self.postagger.load(path.join(self.model_path, 'pos.model') ) # 加载模型
        self.recognizer = NamedEntityRecognizer() # 命名实体识别初始化实例
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser() # 依存句法初始化实例 s
        self.parser.load(path.join(self.model_path, 'parser.model'))  # 加载模型
        self.labeller = SementicRoleLabeller() # 语义角色标注初始化实例
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal (self, text):  #把所有该要使用的东西都提取出来
        words =self.segmentor.segment(text)    # 分词 
        postags = self.postagger.postag(words)  # 词性标注
        netags = self.recognizer.recognize(words, postags)	#命名实体
        arcs = self.parser.parse(words, postags)  # 句法分析
        roles = self.labeller.label(words, postags, netags, arcs)  # 语义角色标注
        return words,postags,arcs,roles,netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
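A usage sketch for LTP_word, assuming model_path points at a directory that contains the LTP models plus dictionary_kfc.txt and the srl model; the input sentence is made up:

ltp = LTP_word('ltp_data_v3.4.0')
words, postags, arcs, roles, netags = ltp.deal('小明在北京大学读书。')
print(list(words), list(postags), list(netags))
print([(arc.head, arc.relation) for arc in arcs])
ltp.release()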
Example #14
def read_and_seg_pos(file_dir):
    segmentor = Segmentor()
    # postagger = Postagger()
    segmentor.load_with_lexicon("f:\\NLPJP\\xlbz\\LTP\\ltp_data_v3.4.0\\cws.model","ceshi.txt")
    # postagger.load_with_lexicon("f:/NLPJP/xlbz/LTP/ltp_data_v3.4.0/pos.model","ceshi.txt")
    file_read = open(file_dir, "r")
    texts = file_read.readlines()
    file_write_seg = open(file_dir + "_seg", "w")
    # file_write_pos = open(file_dir+"_pos","w")

    for text in texts:
        words = segmentor.segment(text)
        file_write_seg.write(" ".join(words) + "\n")
        # postags = postagger.postag(words)
        # words_and_pos.append('$','$')
        # for word,pos in words_and_pos:
        #     if word != '$':
        #         file_write_pos.write(word+" "+pos+" ")
        #     else:
        #         file_write_pos.write('\n')
    file_read.close()
    file_write_seg.close()
    # file_write_pos.close()
    segmentor.release()
    # postagger.release()  # the POS tagger above is commented out, so there is nothing to release

# read_and_seg_pos()  # example call; needs a real file path argument
Example #15
class Word():
    def __init__(self, dictDir):
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(f'{LTP_DATA_DIR}/cws.model',
                                         f'{dictDir}/dict.txt')

    def split(self, myStr):
        return list(self.segmentor.segment(myStr))
Example #16
    def segment(self, texts, use_tag_filter=True):
        # initialize the instances
        # global word_list, netags, postags, relation, heads
        words = []
        pos = []
        ner = []
        rel = []
        hea = []

        segmentor = Segmentor()
        segmentor.load_with_lexicon(self.cws_model_path, './dict/user_recg.dic')  # load the model; the second argument is the custom-lexicon file path (self.dic_list)

        postagger = Postagger()
        postagger.load(self.pos_model_path)

        recognizer = NamedEntityRecognizer()
        recognizer.load(self.ner_model_path)

        parser = Parser()
        parser.load(self.pas_model_path)

        for text in texts:
            text = text.lower()

            word_list = segmentor.segment(text)
            word_list = [word for word in word_list if len(word) > 1]
            # word_list = [word for word in word_list if re.match("[\u0041-\u005a\u4e00-\u9fa5]+", word) != None]  # .decode('utf8') keep only English letters and Chinese characters
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]  # drop stop words

            # POS tagging
            posttags = postagger.postag(word_list)
            postags = list(posttags)

            # named-entity recognition
            netags = recognizer.recognize(word_list, postags)

            # dependency parsing
            arcs = parser.parse(word_list, postags)
            rely_id = [arc.head for arc in arcs]  # head index of each dependency arc
            relation = [arc.relation for arc in arcs]  # dependency relation of each arc
            heads = ['Root' if id == 0 else word_list[id - 1] for id in rely_id]  # word of each head ('Root' for the root node)

            if use_tag_filter:
                dic = dict(zip(word_list, postags))
                word_list = [x for x in dic.keys() if dic[x] in self.tags_filter]

            words.append(word_list)
            pos.append(postags)
            ner.append(netags)
            rel.append(relation)
            hea.append(heads)

        segmentor.release()
        postagger.release()
        recognizer.release()
        parser.release()

        return words, pos, ner, rel, hea
Example #17
def ltp_segmentor(LTP_DATA_DIR, sentence):
    # segmentation model path; the model file is `cws.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, "cws.model")
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load(cws_model_path)  # load the model without a lexicon
    segmentor.load_with_lexicon(cws_model_path, "ltp_data/dict/school")
    words = list(segmentor.segment(sentence))  # word segmentation; copy into a list before release
    segmentor.release()  # release the model
    return words
Example #18
def seg_test(filepath, cwspath, dictpath):
    from pyltp import Segmentor
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cwspath, dictpath)

    text = open(filepath).read()
    words = segmentor.segment(text)
    print('\t'.join(words))
    segmentor.release()
Example #19
def segment(text):
    global segmentor
    if segmentor is None:
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model file is `cws.model`
        segmentor = Segmentor()  # initialize the instance
        segmentor.load_with_lexicon(cws_model_path, 'dict/lexicon.txt')  # load the model; the second argument is your external lexicon path
    words = segmentor.segment(text)  # word segmentation
    # print(list(words))
    return list(words)
Example #20
def test_dict():
    # self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
    dictf = resource_path('dict_zh.txt')
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dictf)
    # segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment('列出派工单')  # fail: '列出所有的采购订单'
    print('\t'.join(words))
    segmentor.release()
Example #21
def segmentor(sentence):
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(cws_model_path, r'D:\ltp\ltp_data_v3.4.0\user.dict')  # load the model
    words = segmentor.segment(sentence)  # word segmentation
    # default output:
    print('\t'.join(words))
    # can also be converted to a list for output
    words_list = list(words)
    segmentor.release()  # release the model
    return words_list
Example #22
def cut(str):
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # segmentation model path; the model file is `cws.model`
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load(cws_model_path)  # load the model without a lexicon
    segmentor.load_with_lexicon(
        cws_model_path, 'lexicon')  # load the model; the second argument 'lexicon' is the custom-lexicon file path
    words = list(segmentor.segment(str))  # segment, e.g. str = "你好,我是大王"; copied into a list before release
    # print(' '.join(words))
    segmentor.release()  # release the model
    return words
Example #23
def cut(sent):
    segmentor = Segmentor()
    model_path = 'D:/app/ltp_data_v3.4.0/cws.model'
    user_dict = 'ds_dict.txt'
    segmentor.load_with_lexicon(model_path, user_dict)
    words = segmentor.segment(sent)
    print(words)
    array_str = "|".join(words)
    print(array_str)
    segmentor.release()  # release the model
    return
Example #24
def cut_sentence(input_sentence):
    segmentor = Segmentor()
    # segmentor.load_with_lexicon("../../../../../ltp_data/cws.model","../../../../../ltp_data/fulluserdict")
    segmentor.load_with_lexicon("/home/liu/ltp_data/cws.model","/home/liu/ltp_data/fulluserdict")
    # segmentation: takes a sentence and returns its tokens; a user dictionary can be added
    # input_sentence = "王老师的办公室在哪里"
    words = segmentor.segment(input_sentence)

    result = ' '.join(words)
    result = 'BOS '+result+' EOS'
    return [result] 
Example #25
class MyPyLtp:
    # local LTP model path
    LTP_DATA_DIR = '/Users/yuanjin/PycharmProjects/ltp_data_v3.4.0'  # path to the LTP model directory
    # LTP model path on the Linux box
    # LTP_DATA_DIR = '/home/student/project/project-01/ltp_data'
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS-tagging model path; the model file is `pos.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    special_word_path = os.path.join(LTP_DATA_DIR, 'special_word.txt')
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    parser_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

    # initialize the models
    def __init__(self):
        # initialize the instances and load the models
        self.segmentor = Segmentor()  # initialize the instance
        self.segmentor.load_with_lexicon(
            self.cws_model_path,
            self.special_word_path)  # load the model; the second argument is your external lexicon path
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        self.parser = Parser()
        self.parser.load(self.parser_model_path)

    # sentence splitting
    def split_sentence(self, text):
        return SentenceSplitter.split(text)

    # word segmentation
    def split_word(self, sentence):
        self.words = list(self.segmentor.segment(sentence))  # segment into words
        return self.words

    # POS tagging
    def tagging_word(self):
        self.tagging = self.postagger.postag(self.words)  # POS tagging
        return self.tagging

    # named-entity recognition
    def name_recognizer(self):
        self.names = self.recognizer.recognize(self.words,
                                               self.tagging)  # named-entity recognition
        return self.names

    # dependency parsing
    def relation_analysis(self, sentence):
        self.split_word(sentence)
        self.tagging_word()
        self.name_recognizer()

        arcs = self.parser.parse(self.words, self.tagging)  # dependency parsing
        return [[arc.head, arc.relation] for arc in arcs]
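A short sketch of driving MyPyLtp end to end; the input text is made up:

ltp = MyPyLtp()
for sentence in ltp.split_sentence('他叫汤姆去拿外衣。汤姆照做了。'):
    print(ltp.relation_analysis(sentence))  # [[head, relation], ...] for each word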
Example #26
    def get_cut_words(self, text, dict_path=None):
        # word segmentation
        segmentor = Segmentor()  # initialize the instance
        if dict_path is None:
            segmentor.load(self.cws_model_path)  # load the model
        else:
            segmentor.load_with_lexicon(self.cws_model_path,
                                        dict_path)  # load the model; the second argument is your external lexicon path
        words = list(segmentor.segment(text))  # copy into a list before release
        print('\t'.join(words))
        segmentor.release()
        return words
Example #27
    def get_words_by_pyltp(self, sent):
        # segmentation model path; the model file is `cws.model`
        cws_model_path = os.path.join(self.ltp_dir_path, "cws.model")
        # dict.txt is the custom-lexicon file
        dict_path = os.path.join(self.ltp_dir_path, "dict.txt")
        segmentor = Segmentor()
        segmentor.load_with_lexicon(cws_model_path, dict_path)
        words_list = list(segmentor.segment(sent))  # copy into a list before releasing the model
        segmentor.release()
        return words_list
Example #28
    def segmentor(self, sentence):
        segmentor = Segmentor()
        # segmentor.load(cws_model_path)  # load the model without a lexicon
        segmentor.load_with_lexicon(cws_model_path,
                                    user_dict_path)  # load the model; the second argument is your external lexicon path
        words = segmentor.segment(sentence)  # word segmentation
        # default output:
        # print('\t'.join(words))
        # can also be converted to a list for output
        word_list = list(words)
        segmentor.release()  # release the model
        return word_list
Example #29
def ltp_segment(sent):
    # load the model files
    cws_model_path = os.path.join(
        'ltp_data_v3.4.0/cws.model')  # segmentation model path; the model file is `cws.model`
    lexicon_path = os.path.join(
        'ltp_data_v3.4.0/lexicon.txt')  # lexicon is the custom-lexicon file path
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cws_model_path, lexicon_path)
    words = list(segmentor.segment(sent))
    segmentor.release()

    return words
Example #30
def words_split():
    """ Segment the sentences into words.

    :return:
    """
    segmentor = Segmentor()
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load_with_lexicon(cws_model_path, 'data/all_word_dict.txt')
    for sequence in sentences:
        words = segmentor.segment(sequence)
        tuple_get(words, sequence)
    segmentor.release()
Example #31
def seg_initialize(model_path, lexicon_path):
    print "load segment data..."
    segmentor = Segmentor()
    segmentor.load_with_lexicon(model_path, lexicon_path)
    return segmentor

def fun_emotion_set(path):
    emotion_set = []
    for line in path.readlines():
        emotion_set.append(line.strip().split('\t')[0])
    return emotion_set

def sortByPMI(coPMI):
    sorted_tuple =[]
    for item in coPMI:
        items = item.split('\001')
        #print 'item:',items,type(items)
        #print coPMI[item],type(coPMI[item])
        sorted_tuple.append((items[0],items[1],coPMI[item]))
    return sorted(sorted_tuple,key =itemgetter(0,2)),sorted(sorted_tuple,key= itemgetter(1,2))


segmentor = Segmentor()
segmentor.load_with_lexicon(os.path.join(MODELDIR,"cws.model"),"/data0/dm/dict/dict.txt")


if __name__ == "__main__":
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    path_property = open(path+"/car_entity_property.txt",'r')
    pro_words = fun_property_set(path_property)
    path_sentiment = open(path+"/car_sentiment_dic.txt",'r')
    sen_words = fun_emotion_set(path_sentiment)
    path_corpus = path+"/car_pmi_corpus.txt"
    path_out1 = open(path+"/pro_sen_pmi_corpus_sort1.txt",'w')
    
    path_out2 = open(path+"/pro_sen_pmi_corpus_sort2.txt",'w')
    
    posPmi = getPMI(path_corpus, pro_words, sen_words)
    sorted_tuple1,sorted_tuple2 = sortByPMI(posPmi)