Example #1
class segment:
    def __init__(self):
        LTP_DATA_DIR = 'resources/ltp_data_v3.4.0/'
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

        from pyltp import Segmentor
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path,
                                         '/path/to/your/lexicon')

    def seg(self, text):
        words = self.segmentor.segment(text)
        return words

    def destroy(self):
        self.segmentor.release()

    def segFile(self, infile, outfile):
        data = codecs.open(infile, 'r')
        out = codecs.open(outfile, 'w')  # optionally pass 'utf-8' as the encoding
        for line in data:
            fields = line.strip().split('\t')
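            # keep the first field as-is; segment each remaining field and join its tokens with spaces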
            out.write(fields[0] + '\t' + '\t'.join(
                [' '.join(self.seg(fields[i]))
                 for i in range(1, len(fields))]) + '\n')
        data.close()
        out.close()
Example #2
def feature_about():
    # get the feature list
    feature_dict = NewsUtil.get_feature()
    # for each feature appearing in the news, collect the 5 words that follow it and their attributes
    logger.info("Preparing raw news...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize the instance
    segmentor.load_with_lexicon(cws_model_path,
                                CFETSFX_LEXICON_PATH)  # load the model; the second argument is your external lexicon path
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
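                        # collect the feature word plus the 5 tokens that follow it (6 in total)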
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
Example #3
def pyltp_cutting(sentence):
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    result = segmentor.segment(sentence)  # segment
    # print('\t'.join(result))
    segmentor.release()  # release the model
    return result
def genData():
    path = "/home/liberty/Sentiment/sentiment-data/pnn_annotated.txt"
    MODELDIR = "/home/liberty/ltp_data"
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    posList = []
    senList = []
    with open(path, "r") as file:
        with open("/home/liberty/Sentiment/sentiment-data/After.txt",
                  "w") as out:
            with open("/home/liberty/Sentiment/sentiment-data/Pos.txt",
                      "w") as posOut:
                cnt = 0
                for line in file.readlines():
                    random.seed(cnt * 10)
                    pos, sentence = line.split("\t")
                    words = list(segmentor.segment(sentence))
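                    # for the first 2500 lines, replace roughly 10% of the tokens (positions sampled at random) with "UNK"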
                    if cnt < 2500:
                        length = len(words)
                        unks = int(length * 0.1)
                        for i in range(unks):
                            idx = random.randint(0, length - 1)
                            words[idx] = "UNK"
                    senList.append(words)
                    posList.append(eval(pos))
                    out.write(" ".join(words) + "\n")
                    posOut.write(pos + "\n")
                    cnt += 1
            segmentor.release()
    return posList, senList
Example #5
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load the model
        self.postagger = Postagger()  # part-of-speech tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """
        Release resources.
        """
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
def pyltp_cut(sentence):
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(sentence)  # segment
    segmentor.release()  # release the model
    return words
Example #7
    def cut_process(Questioning_path, new_word_path='./data/new_word.txt'):
        cws_model_path = os.path.join(config.LTP_DATA_DIR, 'cws.model')
        segmentor = Segmentor()
        segmentor.load(cws_model_path)
        with open(Questioning_path, 'r', encoding='utf8') as f:
            lines = f.readlines()
        with open(config.stopword_path, 'r', encoding='utf8') as f:
            stopword_list = [w.strip() for w in f.readlines()]  # strip newlines so membership checks match

        new_word = []
        for line in lines:
            words = segmentor.segment(line.replace(' ', ''))

            words_list_temp = list(words)
            words_list = []
            for w in words_list_temp:
                if w not in stopword_list:
                    words_list.append(w)

            for i in range(len(words_list) - 1):
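                # two consecutive single-character tokens form a new-word candidate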
                if len(words_list[i]) == 1 and len(words_list[i + 1]) == 1:
                    w = words_list[i] + words_list[i + 1]
                    if w not in new_word:
                        new_word.append(w)
            Logger.log_DEBUG.debug('Segmentation result: ' + str(words_list))
        segmentor.release()
        Logger.log_DEBUG.debug('New words: ' + str(new_word))
        fw = open(new_word_path, 'w', encoding='utf8')
        for w in new_word:
            fw.write(w + '\n')
        fw.close()
        return new_word
Example #8
def words_split():
    """ Perform word segmentation on the sentences.

    :return:
    """
    segmentor = Segmentor()
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load_with_lexicon(cws_model_path, '../data/all_word_dict.txt')
    for sentence in sentences:
        words = segmentor.segment(sentence)
        postags = postaggers(words)
        index = 0
        for word, postag in zip(words, postags):
            if postag == 'v':
                relation_words.append(word)
                # print(word)
        all_words.append(words)
    relation_words_file = open('relation_words.txt', 'w+', encoding='utf8')
    for word in relation_words:
        relation_words_file.write(word + '\n')
    # write all the words scanned so far to the file
    all_words_file = open('all_words.txt', 'w+', encoding='utf8')
    for words in all_words:
        temp_words = '\t'.join(words)
        all_words_file.write(temp_words + '\n')
    segmentor.release()
Example #9
def locationNER(text):
    # word segmentation first
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(text)  # segment
    # print('\t'.join(words))
    segmentor.release()

    # then POS tagging
    postagger = Postagger()  # initialize the instance
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()  # release the model

    # finally, named entity recognition for locations
    results = []

    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(ner_model_path)  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
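    # for a multi-token place/organization entity (I- tag), join the previous, current and next
    # tokens; for a single-token entity (S- tag), keep the token itself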
    for i in range(len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            results.append(words[i - 1] + words[i] + words[i + 1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    return results
def preprocess_data(train_mode=True, fine=True, remove_stopwords=False):
    print("Initializing Segmentor!")
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    if remove_stopwords:
        get_stop_words()
        print(len(stop_words))
    text, y = [], []
    if train_mode:
        path = TRAIN_DATA_PATH
    else:
        path = TEST_DATA_PATH
    for line in open(path, 'r', encoding='utf-8'):
        tmp = line.split('\t')
        assert len(tmp) == 2, "Something wrong with the data!"
        if fine:
            tag, question = tmp[0], tmp[1]
        else:
            tag, question = tmp[0].split('_')[0], tmp[1]
        if remove_stopwords:
            pred_words = remove_stop_words(list(segmentor.segment(question)))
        else:
            pred_words = list(segmentor.segment(question))
        seg_text = ''
        for word in pred_words:
            seg_text += word + ' '
        text.append(seg_text)
        y.append(tag)
    segmentor.release()
    return text, y
Example #11
def segmentor(sentence):
    segmentor = Segmentor()
    segmentor.load('/home/pengbin/下载/ltp_data_v3.4.0/cws.model')
    words = segmentor.segment(sentence)
    words_list = list(words)
    segmentor.release()
    return words_list
Example #12
def cut(string):
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = segmentor.segment(string)
    # print('\t'.join(words))
    segmentor.release()
    return words
Example #13
class LtpLanguageAnalysis(object):
    def __init__(self, model_dir="/home/xxx/ltp-3.4.0/ltp_data/"):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))

    def analyze(self, text):
        # word segmentation
        words = self.segmentor.segment(text)
        print('\t'.join(words))

        # POS tagging
        postags = self.postagger.postag(words)
        print('\t'.join(postags))

        # dependency parsing
        arcs = self.parser.parse(words, postags)
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
def cut_words():
    # segmentation + removal of blank lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model',
                                'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize the instance
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # segmentation
            pos_tags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    f.close()
    segmentor.release()
    postagger.release()
Example #15
def demo_three():
    string = '这个把手该换了,我不喜欢日本和服,别把手放在我的肩膀上,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
    segmentor = Segmentor()
    segmentor.load(sg_model_path)
    ret = segmentor.segment(string)
    print('/'.join(ret))
    segmentor.release()
Example #16
def extract_views(all_sents):
    segmentor = Segmentor()
    segmentor.load(r'/home/student/project-01/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load(r'/home/student/project-01/ltp_data/pos.model')
    parser = Parser()
    parser.load(r'/home/student/project-01/ltp_data/parser.model')
    views_in_sents = []
    for i, sents in enumerate(all_sents):
        views_tmp = []
        for sent in sents:
            sent = sent.replace('\\n', '\n').strip()
            if len(sent) == 0:
                continue
            # words = list(jieba.cut(sent))
            words = list(segmentor.segment(sent))
            contains = contain_candidates(words)
            if len(contains) == 0:
                continue
            tags = list(postagger.postag(words))
            arcs = list(parser.parse(words, tags))
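            # get_sbv_head (defined elsewhere) is assumed to return (word, index) pairs for the SBV subject and its head predicate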
            sbv, head = get_sbv_head(arcs, words, tags)
            if sbv[0] is None or head[0] is None or head[0] not in contains:
                continue
            subj = sbv[0]
            view = clean_view(words[head[1] + 1:])
            views_tmp.append((subj, view, i))
        if len(views_tmp) > 0:
            views_in_sents.append({'sents': sents, 'views': views_tmp})
    segmentor.release()
    postagger.release()
    parser.release()
    return views_in_sents
def get_words_list(string):

    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words_list = list(segmentor.segment(string))
    segmentor.release()
    return words_list
Example #18
class pyltp_model():
    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(
            LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
        self.segmentor = Segmentor()  # initialize the instance
        self.postagger = Postagger()  # initialize the instance
        self.recognizer = NamedEntityRecognizer()  # initialize the instance

        self.segmentor.load(cws_model_path)  # load the model
        self.postagger.load(pos_model_path)  # load the model
        self.recognizer.load(ner_model_path)  # load the model

    def token(self, sentence):
        words = self.segmentor.segment(sentence)  # word segmentation
        words = list(words)
        postags = self.postagger.postag(words)  # POS tagging
        postags = list(postags)
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        netags = list(netags)
        result = []
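        # replace single-token person/organization/place entities (S-Nh/S-Ni/S-Ns)
        # with their NE tag and keep all other tokens unchanged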
        for i, j in zip(words, netags):
            if j in ['S-Nh', 'S-Ni', 'S-Ns']:
                result.append(j)
                continue
            result.append(i)
        return result

    def close(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()  # release the models
Example #19
def cut_word(string):
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = segmentor.segment(string)
    segmentor.release()

    return ' '.join(words)
def ltp_seg(s):
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(s)  # segment
    s = " ".join(words)
    segmentor.release()  # release the model
    return s
Example #21
class LTP_word():
    """docstring for parser_word
    deal() processes a text and returns five values: words, POS tags, dependency arcs, semantic roles, and named-entity tags
    release() frees the loaded models"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # named entity recognition instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labelling instance
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):  # extract everything that will be needed
        words = self.segmentor.segment(text)  # word segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labelling
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
def namedEntityRecognize(sentence):
    '''
        Use the pyltp module for named entity recognition.
        Returns: 1) a list of (entity, tag) tuples and 2) a list of entity tags
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # pack into (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))

    neTagList = '\t'.join(netags).split('\t')

    return namedEntityTagTupleList, neTagList
Example #23
    def cut_words(self):
        print("plot:", self.plot)
        segmentor = Segmentor()  # initialize the instance
        segmentor.load('ltp_data/cws.model')  # load the model
        self.words = segmentor.segment(self.plot)
        print('\t'.join(self.words))
        segmentor.release()  # release the model
Example #24
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(DepTree, self).__init__()
        print("Loading LTP models...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("Models loaded.")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
Example #25
def sent_split(sentence):
    segmentor = Segmentor()
    segmentor.load(seg_model_path)
    words = segmentor.segment(sentence)
    segmentor.release()
    print('\t'.join(words))
    return words
Example #26
    def ltp_word(self):
        """A method that performs word segmentation, POS tagging, and related processing on the sentence."""
        # word segmentation
        segmentor = Segmentor()
        segmentor.load(os.path.join(MODELDIR, "cws.model"))
        words = segmentor.segment(self.content)
        #print("*************segmentation*****************")
        #print("\t".join(words))

        # POS tagging
        postagger = Postagger()
        postagger.load(os.path.join(MODELDIR, "pos.model"))
        postags = postagger.postag(words)
        #print("*************POS tagging*************")
        #print(type(postags))
        #print("\t".join(postags))

        # dependency parsing
        parser = Parser()
        parser.load(os.path.join(MODELDIR, "parser.model"))
        arcs = parser.parse(words, postags)
        #print("*************dependency parsing*************")
        #print(type(arcs))
        #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        # separate the head and relation fields of the dependency parsing result
        arcs_head = []
        arcs_relation = []
        for arc in arcs:
            arcs_head.append(arc.head)
            arcs_relation.append(arc.relation)

        # named entity recognition
        recognizer = NamedEntityRecognizer()
        recognizer.load(os.path.join(MODELDIR, "ner.model"))
        netags = recognizer.recognize(words, postags)
        #print("*************named entity recognition*************")
        #print("\t".join(netags))
        """
        # semantic role labelling
        labeller = SementicRoleLabeller()
        labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        roles = labeller.label(words, postags, arcs)
        print("*************semantic role labelling*************")
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        """

        segmentor.release()
        postagger.release()
        parser.release()
        recognizer.release()
        #labeller.release()

        # call list_conversion to turn the processing results into lists
        words_result = list_conversion(words, postags, netags, arcs_head,
                                       arcs_relation)

        return words_result
Example #27
def seg(input_file, output_file):
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model

    data = json.load(open(input_file, "r", encoding="utf-8"))
    count = 0
    for d in data:
        count += 1
        print(count)
        d['article_content'] = list(segmentor.segment(d['article_content']))
        d['article_title'] = list(segmentor.segment(d['article_title']))

        questions = d["questions"]
        for q in questions:
            q['answer'] = list(segmentor.segment(
                q['answer']))  # not really used; the computation now relies on answer_span
            q['question'] = list(segmentor.segment(q['question']))
            # derive a word-level answer_span from the character-level one
            if len(q['answer']) == 0 or len(q['question']) == 0:
                continue
            answer_span_char_level = q['answer_span']
            answer_span_word_level = answer_span_char2word(
                d['article_content'], answer_span_char_level)
            q['answer_span'] = answer_span_word_level

    segmentor.release()  # release the model
    json.dump(data,
              open(output_file, "w", encoding="utf-8"),
              ensure_ascii=False)
Example #28
    def get_word_list(self, sentence, model):
        # get the segmented word list
        segmentor = Segmentor()
        segmentor.load(model)
        word_list = list(segmentor.segment(sentence))
        segmentor.release()
        return word_list
Example #29
class Parse_Util(object):
    def __init__(self, lexicon_path='./data/lexicon'):
        # word segmentation
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba segmentation
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # parse a sentence
    def parse_sentence(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)

        return words, postags, netags, arcs
def cut_sentence(file_in):
    cutwords_list = []  # reset
    file_original_txt = open(file_in, 'r', encoding='utf-8')
    stopwords = [
        line.rstrip() for line in open('stopwords', encoding='utf-8')
    ]  # rstrip() removes trailing characters (whitespace by default)

    segmentor = Segmentor()
    segmentor.load('cws.model')  # load the model
    sentences = file_original_txt.readlines()

    for sente in sentences:
        temp = ''  # holds the segmented "sentence"
        sente = str(sente).encode('utf-8').decode(
            'utf-8-sig')  # re-encode/decode to strip a possible BOM; otherwise the label would be the invalid '\ufeff' character
        label = str(sente[0:1])  # take the label
        temp += label + '\t'
        sente = sente[2:]  # drop the label
        words = segmentor.segment(sente)  # segmentation; result type is pyltp.VectorOfString
        word_list = list(words)  # collect into a list
        for word in word_list[1:]:
            if word not in stopwords:
                temp += word + ' '
        cutwords_list.append(temp)

    segmentor.release()  # release the model
    file_original_txt.close()
    return cutwords_list
Example #31
def process(index):

	ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
	sys.path.append(os.path.join(ROOTDIR, "lib"))

	# Set your own model path
	MODELDIR=os.path.join(ROOTDIR, "ltp_data")

	segmentor = Segmentor()
	segmentor.load(os.path.join(MODELDIR, "cws.model"))

	finname = "o_"+str(index)+".txt"
	foutname = "p_"+str(index)+".txt"
	print(finname)
	count = 0
	fin = codecs.open(finname, encoding='utf-8')
	with codecs.open(foutname, 'w', encoding="utf-8") as fout:
		while 1:
			line = fin.readline()
			if not line:
				break
			tmp = line.split(" ^ {")[1] # Get JSON
			tmp = "{"+tmp
			data = json.loads(tmp)
			content = data['content']
			# error_correction(content)
			content = content.strip()
			segmentation = ""
			for line in content.split("\n"):
				line = line.encode("utf-8")
				words = segmentor.segment(line)
				segmentation += "/".join(words)
				segmentation += "/"

			# The function's return type is str, not unicode, so it needs to be converted to unicode.
			segmentation = unicode(segmentation, "utf-8")
			pinyin = add_pinyin(segmentation)
			obj = {}
			obj['flavor'] = data['flavor']
			obj['environment'] = data['environment']
			obj['service'] = data['service']
			obj['content'] = data['content']
			obj['segmentation'] = segmentation
			obj['pinyin'] = pinyin
			tmpstr = json.dumps(obj,ensure_ascii=False)
			fout.write(tmpstr)
			fout.write('\n')
			count += 1
			print(count)
		segmentor.release()
Example #32
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model file is `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model path; the model file is `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)
        # print("|".join(words))
        segmentor.release()


        postagger = Postagger()  # initialize the instance
        postagger.load(self.pos_model_path)  # load the model
        self.postags = postagger.postag(self.words)  # POS tagging
        # print('\t'.join(postags))
        postagger.release()  # release the model


        recognizer = NamedEntityRecognizer()  # initialize the instance
        recognizer.load(self.ner_model_path)  # load the model
        self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
        # print('\t'.join(netags))
        recognizer.release()  # release the model
Example #33
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
Example #34
    def ws_data(self):
        f = open("pnn_annotated.txt", 'r')
        total_line = 0
        orgin_attr = [0, 0, 0]
        judge_attr = [0, 0, 0]
        right = [0, 0, 0]
        segmentor = Segmentor()
        segmentor.load("cws.model")
        for line in f:
            total_line += 1
            # print 'line has been read'
            value_num = [0, 0]
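            # value_num[0] counts positive words, value_num[1] counts negative words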
            result = line.split('\t')
            ws_lst = segmentor.segment(result[1])
            # print 'this line is %s' % (line)

            for i in ws_lst:
                classify = ''
                try:
                    value = self.setiment_words[i]
                except:
                    pass
                else:
                    if value == 1:
                        print('positive word:%s' % i)
                        value_num[0] += 1
                    elif value == -1:
                        print('negative word:%s' % i)
                        value_num[1] += 1

            if value_num[0] == 0 and value_num[1] == 0:
                classify = 'neutral'
                judge_attr[0] += 1
            elif value_num[0] == value_num[1] != 0:
                classify = 'neutral'
                judge_attr[0] += 1
            elif value_num[0] > value_num[1]:
                classify = 'positive'
                judge_attr[1] += 1
            else:
                classify = 'negative'
                judge_attr[2] += 1

            print(value_num)
            print('classify result:%s' % classify)

            # count of the original emotion labels
            if result[0] == '0':
                orgin_attr[0] += 1
            elif result[0] == '1':
                orgin_attr[1] += 1
            else:
                orgin_attr[2] += 1

            if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0):
                # print 'neutral'
                right[0] += 1
            elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0):
                # print 'neutral'
                right[0] += 1
            elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0):
                # print 'positive'
                right[1] += 1
            elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0):
                # print 'negative'
                right[2] += 1

            # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
        print("origin's neutral, positive, negative")
        print(orgin_attr)

        print('judge_attr neutral, positive, negative')
        print(judge_attr)

        print('neutral, positive, negative')
        print(right)
        print(right[0] + right[1] + right[2])

        print('total_line %f\n' % total_line)
        print('Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line)))
        segmentor.release()