Example #1
def segment(sentence, is_cut2char=False, enable_pos=False):
    """
    Segment the input sentence.
    :param sentence:
    :param is_cut2char: False use jieba.lcut; True use list(sentence)
    :param enable_pos: bool, enable POS
    :return: list
    """
    import jieba
    from jieba import posseg

    jieba.setLogLevel(log_level="ERROR")
    if enable_pos:
        if not is_cut2char:
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        else:
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if not is_cut2char:
            return jieba.lcut(sentence)
        else:
            return list(sentence)
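
# A minimal usage sketch for the segment() helper above; the sentence and the
# outputs in the comments are illustrative, not taken from the source.
print(segment('我爱北京天安门'))                     # word tokens, e.g. ['我', '爱', '北京', '天安门']
print(segment('我爱北京天安门', is_cut2char=True))    # character tokens
words, flags = segment('我爱北京天安门', enable_pos=True)
print(list(zip(words, flags)))                       # (token, POS flag) pairs, e.g. ('北京', 'ns')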
Example #2
def segment(sentence, cut_type='word', pos=False):
    """
    Segment the input sentence.
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :return: list
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
Example #3
def segment(sentence, cut_type='word', pos=False, None_flag='O'):
    """
    Segment the input sentence.
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :param None_flag: the 'O' flag of a BIO tagging scheme (not used in this snippet)
    :return: list
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
Example #4
def SegJieba(InfoGene):
    (name, yxsj, fj, yxzd) = next(InfoGene)
    print(name)
    keywords = get_key(yxsj)
    yxsj_vec = re.split(r'[,。;]+', yxsj)
    # a custom user dictionary could also be defined here
    word_list = jieba.lcut(yxsj)
    freq_dist = nltk.FreqDist(word_list)
    print(freq_dist)
    for i in freq_dist:
        print(i)
    jieba.add_word(word="两肺", freq=None, tag='n')
    jieba.add_word(word="支气管壁", freq=None, tag='n')
    jieba.add_word(word="左肺", freq=None, tag='n')
    clinic_dict = {}
    discrip = ''
    key = ''  # initialize key so the dict update below cannot hit an unbound name
    for sent in yxsj_vec:
        print([(x.word, x.flag) for x in psg.lcut(sent)])
    for sent in yxsj_vec:
        for x in psg.lcut(sent):
            if x.word in keywords and x.flag == 'n':
                key = x.word
                discrip = clinic_dict.get(key, "")
            if x.word in keywords and (x.flag == 'a' or x.flag == 'v'):
                discrip = discrip + x.word
        clinic_dict[key] = discrip
        if discrip != "":
            print(key, clinic_dict[key])
Example #5
def segment(sentence,cut_type='word',pos=False):
    """
    :param sentence:           sentences will later be assembled from the dataset
    :param cut_type:           granularity: 'word' or 'char' level
    :param pos:                whether to tag part of speech (off by default)
    :return:
    """
    if pos:  # branch first on whether POS tagging is requested
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            '''
            Note the difference between cut and lcut:
            cut returns a generator, so tokens have to be pulled out e.g. with a for loop,
            while lcut directly returns a list.
            '''
            word_seq,pos_seq = [],[]
            # two lists: the first holds the tokens, the second their POS flags (when pos=True)
            for w,p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq,pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)  # take the flag (POS) of the first pair
            return word_seq,pos_seq

    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':  # char granularity: list(sentence) is already the result
            return list(sentence)
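
# The comment in the function above notes the difference between jieba.cut and
# jieba.lcut; a quick check of that claim (the sentence is illustrative):
import jieba

generator = jieba.cut('我爱北京天安门')   # cut yields tokens lazily
as_list = jieba.lcut('我爱北京天安门')    # lcut returns a list directly
assert list(generator) == as_list         # lcut(s) is equivalent to list(cut(s))
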
def segment(sentence, cut_type='word', pos=False):
    seg_words = []
    seg_pos = []

    if cut_type == 'word':
        if pos:
            seg_word_pos = posseg.lcut(sentence)
            for word, flag in seg_word_pos:  # use 'flag' so the pos parameter is not shadowed
                seg_words.append(word)
                seg_pos.append(flag)
            return seg_words, seg_pos
        else:
            seg_words = jieba.lcut(sentence)
            return seg_words

    if cut_type == 'char':
        if pos:
            for char in sentence:
                seg_word_pos = posseg.lcut(char)
                for word, flag in seg_word_pos:
                    seg_words.append(word)
                    seg_pos.append(flag)
            return seg_words, seg_pos
        else:
            for char in sentence:
                seg_words.append(char)
            return seg_words
Example #7
    def getCosinSimilarity(self, str1, str2):
        soupfcontent1 = BeautifulSoup(str(str1), "html.parser")
        content_table1 = soupfcontent1.find_all('table')
        soupfcontent2 = BeautifulSoup(str(str2), "html.parser")
        content_table2 = soupfcontent2.find_all('table')

        cut_str1 = [
            w for w, t in posseg.lcut(str(content_table1))
            if 'n' in t or 'v' in t
        ]
        cut_str2 = [
            w for w, t in posseg.lcut(str(content_table2))
            if 'n' in t or 'v' in t
        ]
        # 列出所有词
        all_words = set(cut_str1 + cut_str2)
        # 计算词频
        freq_str1 = [cut_str1.count(x) for x in all_words]
        freq_str2 = [cut_str2.count(x) for x in all_words]
        # 计算相似度
        sum_all = sum(map(lambda z, y: z * y, freq_str1, freq_str2))
        sqrt_str1 = math.sqrt(sum(x**2 for x in freq_str1))
        sqrt_str2 = math.sqrt(sum(x**2 for x in freq_str2))
        cosin_similarity = sum_all / (sqrt_str1 * sqrt_str2)
        print(cosin_similarity)
Example #8
def loadDocument(stopList):
    global docList
    docList = []
    for file in os.listdir(negPath):
        news = None
        with open(os.path.join(negPath,file),'r',encoding='utf-8') as f:
            news = f.read()
            noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
            news = list(jieba.cut(news))
            news = [word for word in news if (word not in stopList) and (word not in noun)]  # 过滤停用词和名词
        docList.append(news)
    for file in os.listdir(neuPath):
        news = None
        with open(os.path.join(neuPath,file),'r',encoding='utf-8') as f:
            news = f.read()
            noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
            news = list(jieba.cut(news))
            news = [word for word in news if (word not in stopList) and (word not in noun)]  # 过滤停用词和名词
        docList.append(news)
    for file in os.listdir(posPath):
        news = None
        with open(os.path.join(posPath,file),'r',encoding='utf-8') as f:
            news = f.read()
            noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
            news = list(jieba.cut(news))
            news = [word for word in news if (word not in stopList) and (word not in noun)]  # 过滤停用词和名词
        docList.append(news)
    return None
Example #9
def segment(sentence: str, cut_type: str = 'word', pos: bool = False) -> list:
    """
    对句子进行分词操作。
    :param sentence: 需要分词的句子
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence) 分词方式
    :param pos: enable POS 是否启用POS(词性标注)
    :return: list 分词后所有词组成的列表
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':  # 按字符分隔
            word_seq = list(sentence)  # 把句子拆分为单个字组成的列表
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)  # 获取对应的词性 pos
            return word_seq, pos_seq
    else:
        if cut_type == 'word':  # 按词分隔
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
Example #10
def segment(sentence, cut_type='word', pos=False):
    """
    Segment the input sentence.
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :return: list
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for i in word_seq:
                w_p = posseg.lcut(i)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
Example #11
    def _cut_jieba(row):
        """
		cut the sentences into tokens
		:param row:
		:return:
		"""
        cut_words = []
        cut_flags = []
        if '。' in row:
            row = row.split('。')
            for idx, s in enumerate(row):
                if idx != len(row) - 1:
                    s = s + '。'
                s_cut = list(pseg.lcut(s, HMM=False))
                cut_words.extend([c.word for c in s_cut])
                cut_flags.extend([c.flag for c in s_cut])
        else:
            s_cut = list(pseg.lcut(row, HMM=False))
            cut_words = [c.word for c in s_cut]
            cut_flags = [c.flag for c in s_cut]

        new_row = pd.Series(dtype=object)  # explicit dtype avoids the empty-Series deprecation warning
        new_row['tokens'] = cut_words
        new_row['flags'] = cut_flags
        return new_row
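
# A usage sketch for _cut_jieba (assumes it is accessible at module level and that
# pandas/jieba.posseg are imported as pd/pseg as in the snippet); the input is illustrative.
row = _cut_jieba('今天天气不错。我们出去走走。')
print(row['tokens'])   # segmented words from both sub-sentences
print(row['flags'])    # the corresponding POS flags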
Example #12
def loadWords(stopList):
    global wordsList
    wordsSet = set()
    for file in os.listdir(negPath):
        news = None
        with open(os.path.join(negPath,file),'r',encoding='utf-8',errors='ignore') as f:
            news = f.read()
            noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')] # 拿到其中的名词列表
            news = set(jieba.cut(news))
            news = {word for word in news if (word not in stopList) and (word not in noun)}  # 过滤停用词和名词
        wordsSet = news | wordsSet # 取集合并集
    # 最后要使用list类型,因为要保证结果的有序性
    wordsList = list(wordsSet)
    for file in os.listdir(neuPath):
        news = None
        with open(os.path.join(neuPath,file),'r',encoding='utf-8',errors='ignore') as f:
            news = f.read()
            noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')] # 拿到其中的名词列表
            news = set(jieba.cut(news))
            news = {word for word in news if (word not in stopList) and (word not in noun)}  # 过滤停用词和名词
        wordsSet = news | wordsSet # 取集合并集
    # 最后要使用list类型,因为要保证结果的有序性
    wordsList = list(wordsSet)
    for file in os.listdir(posPath):
        news = None
        with open(os.path.join(posPath,file),'r',encoding='utf-8',errors='ignore') as f:
            news = f.read()
            noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')] # 拿到其中的名词列表
            news = set(jieba.cut(news))
            news = {word for word in news if (word not in stopList) and (word not in noun)}  # 过滤停用词和名词
        wordsSet = news | wordsSet # 取集合并集
    # 最后要使用list类型,因为要保证结果的有序性
    wordsList = list(wordsSet)
    return None
Example #13
 def tokenizer(self, text):
     if type(text) is list:
         result = list()
         for s in text:
             result.append(pseg.lcut(s))
         return result
     else:
         return pseg.lcut(text)
Example #14
def simicos(str1, str2):
    cut_str1 = [w for w, t in posseg.lcut(str1) if t != 'x']
    cut_str2 = [w for w, t in posseg.lcut(str2) if t != 'x']
    if cut_str1 != [] and cut_str2 != []:
        all_words = set(cut_str1 + cut_str2)
        freq_str1 = [cut_str1.count(x) for x in all_words]
        freq_str2 = [cut_str2.count(x) for x in all_words]
        sum_all = sum(map(lambda z, y: z * y, freq_str1, freq_str2))
        sqrt_str1 = math.sqrt(sum(x ** 2 for x in freq_str1))
        sqrt_str2 = math.sqrt(sum(x ** 2 for x in freq_str2))
        return sum_all / (sqrt_str1 * sqrt_str2)
    else:
        return 0
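
# A usage sketch for simicos; the sentences are illustrative. The returned value is the
# standard cosine similarity sum(x_i * y_i) / (sqrt(sum(x_i ** 2)) * sqrt(sum(y_i ** 2)))
# over the combined vocabulary, so for these term-frequency vectors it lies in [0, 1].
print(simicos('我喜欢看电影', '我也喜欢看电影'))   # higher means more word overlap
print(simicos('我喜欢看电影', ''))                 # 0: the guard branch for an empty token list
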
def process_postag(text):
    word, pos = [], []
    for w, p in posseg.lcut(text):
        word += [w] * len(w)
        pos += [p] * len(w)

    return word, pos
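
# process_postag repeats each word and its POS flag once per character, producing
# character-aligned label sequences. The segmentation below is what jieba typically
# returns for this sentence, but it is illustrative rather than guaranteed.
word, pos = process_postag('北京欢迎你')
print(word)                            # e.g. ['北京', '北京', '欢迎', '欢迎', '你']
print(pos)                             # e.g. ['ns', 'ns', 'v', 'v', 'r']
print(len(word) == len('北京欢迎你'))   # True: one entry per character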
Example #16
def delNOTNeedWords(content,customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words

    result=''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword)>0 and flag in [u'n',u'nr',u'ns',u'nt',u'nz',u'ng',u't',u'tg',u'f',u'v',u'vd',u'vn',u'vf',u'vx',u'vi',u'vl',u'vg', u'a',u'an',u'ag',u'al',u'm',u'mq',u'o',u'x']):
            # and flag[0] in [u'n', u'f', u'a', u'z']):
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #去停用词和其他词性,比如非名词动词等
            result += tempword # +"/"+str(w.flag)+" "  #去停用词
            return_words.append(tempword)
    return result,return_words
    def split_words(s):
        # 繁体转简体
        s = SplitWords.__convert(s)

        # 去除标签
        s = SplitWords.__del_non_tag(s)

        # 去除标点符号
        s = SplitWords.__del_punctuation(s)

        # 去除数字
        s = SplitWords.__del_digit(s)

        # 分词 带有词性
        words = pseg.lcut(s, HMM=True)
        # 重新编码 UTF-8
        words = SplitWords.__reencoding(words)

        # 去掉中文停用词
        words = SplitWords.__del_stop(words,
                                      SplitWords.__read_chinese_stoplist())

        # 去掉英文停用词
        words = SplitWords.__del_stop(words,
                                      SplitWords.__read_english_stoplist())

        # 去掉多余的空格
        words = SplitWords.__del_blank(words)

        # 去掉无用的词性词汇,并将剩下的词汇的词性删除
        words = SplitWords.__del_non_pos(words)

        return words
Example #18
    def parse_by_rules(self, text):
        self.words = pseg.lcut(parse_cn_number(text), HMM=False)
        while self.has_next():
            beginning = self.get_index()

            self.consume_year_period() \
                or self.consume_month_period() \
                or self.consume_day_period()

            self.consume_weekday_period() \
                or self.consume_hour_period() \
                or self.consume_minute_period()

            self.consume_year() \
                or self.consume_month() \
                or self.consume_day()

            self.consume_hour()

            if self.get_index() != beginning:
                # Time found
                self.consume_word(u'准时')
                if self.consume_word(u'提醒'):
                    self.consume_word(u'我')
                if self.current_tag() == 'v' and self.peek_next_word() == u'我':
                    self.advance(2)
                self.consume_to_end()
                return Remind(time=self.now, desc=text, event=self.do_what)
            else:
                self.advance()
        return None
Example #19
    def maxSimTxt(self, intxt, simCondision=0.15, simType='simple'):
        '''
        Find the sentence in the knowledge base most similar to the input.

        :param intxt: input text
        :param simCondision: similarity threshold
        :param simType:
        :return:
        '''
        self.lastTxt.append(intxt)
        if simType not in ('simple', 'simple_pos', 'vec'):
            return 'error:  maxSimTxt的simType类型不存在: {}'.format(simType)

        # 如果没有加载词向量,那么降级成 simple_pos 方法
        embedding = self.vecModel
        if simType == 'vec' and not embedding:
            simType = 'simple_pos'

        for t in self.zhishiku:
            questions = t.q_vec if simType == 'vec' else t.q_word
            in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(
                intxt)

            t.sim = max(
                similarity(
                    in_vec, question, method=simType, embedding=embedding)
                for question in questions)
        maxSim = max(self.zhishiku, key=lambda x: x.sim)
        logger.info('maxSim=' + format(maxSim.sim, '.0%'))

        if maxSim.sim < simCondision:
            return [''], ''

        return maxSim.q, maxSim.a
Example #20
 def _cut_word(self, comment):  #分词
     word_pairs = posseg.lcut(comment, HMM=False)
     result = []
     for t in word_pairs:
         if not (t.word in result or t.word in self.stop_words):
             result.append(t.word)
     return '/'.join(result)
Example #21
 def participle(self, raw_sentence):
     """对原始语句分词,去标点,返回两个列表,第一个为分词结果,第二个为词性列表"""
     m = []
     n = []
     # Age expressions
     age_list = re.findall(r"\d+岁.*?月|\d+岁半|\d+岁|\d+年级|[一二三四五六七八九]年级", raw_sentence)
     # Date/time expressions
     time_list = re.findall(r"\d+号上午\d+点|\d+号下午\d+点|\d+号上午|\d+号下午|\d+号晚上|\d+号|\d+[::]\d+", raw_sentence)
     total = age_list + time_list
     for i in total:
         jieba.add_word(i)
     for i, j in pseg.lcut(raw_sentence):  # 去标点
         if i not in self.stopwords:
             m.append(i)
             n.append(j)
     # 把地址合在一起,例如将['北京市','海淀区','西土城路']合称为'北京市海淀区西土城路'
     index = []
     for i in range(len(n)):
         if n[i] == 'ns':
             index.append(i)
     if len(index) > 1:
         for i in range(index[-1]-index[0]):
             m[index[0]] += m[index[0]+i+1]
             m[index[0]+i+1] = ''
             n[index[0]+i+1] = ''
         x, y = [], []
         for i in m:
             if i != '':
                 x.append(i)
         for i in n:
             if i != '':
                 y.append(i)
     else:
         x, y = m, n
     return x, y
Example #22
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        import os

        file_stop_words = "stopwords.txt"
        if os.path.exists(file_stop_words):
            stop_words = codecs.open(file_stop_words, encoding="UTF-8").read().split("\n")  # split into a list, as in the other variants of this helper
            customstopwords = stop_words

    result = ""
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        if word not in customstopwords and flag[0]:
            # in [u'n', u'f', u'a',  u'v', u'd',u'z']):
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #去停用词和其他词性,比如非名词动词等
            result += word.encode("utf-8")  # +"/"+str(w.flag)+" "  #去停用词
            return_words.append(word.encode("utf-8"))
    return result, return_words
Example #23
    def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
        """
        Find the sentence in the knowledge base most similar to the input.
        simType=simple, simple_POS, vec
        """
        self.lastTxt.append(intxt)
        if simType not in ('simple', 'simple_pos', 'vec'):
            return 'error:  maxSimTxt的simType类型不存在: {}'.format(simType)

        # 如果没有加载词向量,那么降级成 simple_pos 方法
        embedding = self.vecModel
        if simType == 'vec' and not embedding:
            simType = 'simple_pos'

        for t in self.zhishiku:
            questions = t.q_vec if simType == 'vec' else t.q_word
            in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(
                intxt)

            t.sim = max(
                similarity(
                    in_vec, question, method=simType, embedding=embedding)
                for question in questions)
        maxSim = max(self.zhishiku, key=lambda x: x.sim)
        logger.info('maxSim=' + format(maxSim.sim, '.0%'))

        if maxSim.sim < simCondision:
            return '抱歉,我没有理解您的意思。请您询问有关汽车的话题。'

        return maxSim.a
Example #24
async def _(session: NLPSession):
    # 去掉消息首尾的空白符
    stripped_msg = session.msg_text.strip()
    # 对消息进行分词和词性标注
    words = posseg.lcut(stripped_msg)
    dates = {'今天': 0,
             '明天': 1,
             '后天': 2
             }
    searchInfo = {'city': None,
                  'date': None}
    # 遍历 posseg.lcut 返回的列表
    for word in words:
        # 每个元素是一个 pair 对象,包含 word 和 flag 两个属性,分别表示词和词性
        if word.flag == 'ns' and searchInfo['city'] is None:
            # ns 词性表示地名
            searchInfo['city'] = word.word
        if word.flag == 't' and searchInfo['date'] is None:
            if word.word in dates:
                searchInfo['date'] = dates[word.word]
        if (searchInfo['city'] is not None) and (searchInfo['date'] is not None):
            break
    if searchInfo['date'] is None:
        searchInfo['date'] = 0

    # 返回意图命令,前两个参数必填,分别表示置信度和意图命令名
    return IntentCommand(80.0, 'weather', current_arg=json.dumps(searchInfo) or '')
def multilingual_sent_split(texts):
    print('\nOriginal texts: ', texts)
    lingual_split_sign = {'x', 'eng'}
    final_parts = []
    sub_part = []
    cuts = pseg.lcut(texts)
    if len(cuts) < 2:
        # Guard: with fewer than two tokens there is nothing to split.
        final_strs = [texts] if texts else []
        print('Cut texts: ', final_strs)
        return final_strs
    for idx in range(len(cuts) - 1):
        # 如果当前位置的词语词性和下一个词词性相同,则把当前位置上的词添加进当前的sub_part中
        if (cuts[idx].flag in lingual_split_sign
                and cuts[idx + 1].flag in lingual_split_sign) or (
                    cuts[idx].flag not in lingual_split_sign
                    and cuts[idx + 1].flag not in lingual_split_sign):
            sub_part.append(cuts[idx].word)
        # 否则就应该把当前的sub_part添加进final_parts中,且要新建sub_part
        else:
            sub_part.append(cuts[idx].word)
            final_parts.append(sub_part)
            sub_part = []
    # 最后一个词如果和倒数第二个词词性相同,则把最后一个词添加进当前的sub_part中
    if (cuts[-1].flag in lingual_split_sign
            and cuts[-2].flag in lingual_split_sign) or (
                cuts[-1].flag not in lingual_split_sign
                and cuts[-2].flag not in lingual_split_sign):
        sub_part.append(cuts[-1].word)
    # 最后一个词如果和倒数第二个词词性不相同,则把最后一个词作为新的sub_part添加进final_parts中
    else:
        final_parts.append([cuts[-1].word])
    if sub_part:
        final_parts.append(sub_part)
    final_strs = [''.join(_l) for _l in final_parts]
    print('Cut texts: ', final_strs)
    return final_strs
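
# A usage sketch for multilingual_sent_split; the mixed-language input is illustrative
# and the exact grouping depends on how jieba tags each token.
parts = multilingual_sent_split('我在用Python写代码')
print(parts)   # e.g. ['我在用', 'Python', '写代码'] -- same-kind runs are joined back together
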
def filter_stop_words(content):
    result = []  # 最终返回结果
    words = pseg.lcut(content)  # 分词
    for word in words:
        if word.word.strip() not in stop_words and word.flag[0] in [u'n']:
            result.append(word.word.strip().encode('utf-8'))
    return result
    def split_words(s):
        # 繁体转简体
        s = SplitWords.__convert(s)

        # 去除标签
        s = SplitWords.__del_non_tag(s)

        # 去除标点符号
        s = SplitWords.__del_punctuation(s)

        # 去除数字
        s = SplitWords.__del_digit(s)

        # 分词 带有词性
        words = pseg.lcut(s, HMM=True)
        # 重新编码 UTF-8
        words = SplitWords.__reencoding(words)

        # 去掉中文停用词
        words = SplitWords.__del_stop(words, SplitWords.__read_chinese_stoplist())

        # 去掉英文停用词
        words = SplitWords.__del_stop(words, SplitWords.__read_english_stoplist())

        # 去掉多余的空格
        words = SplitWords.__del_blank(words)

        # 去掉无用的词性词汇,并将剩下的词汇的词性删除
        words = SplitWords.__del_non_pos(words)

        return words
Example #28
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether to add POS tags
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
    print("segment ok. input file count:", count)
 def extract_keyword(self, text, number):
     """
     Extract keywords with TextRank.
     :param text: input text
     :param number: how many keywords to extract
     :return: the top `number` keywords ranked by importance
     """
     graph = TextRank()
     occu2num = defaultdict(int)
     seg_list = psg.lcut(text)
     for i, pair in enumerate(seg_list):
         if pair.flag[0] in self.tag and len(pair.word) > 1:
             for j in range(i + 1, i + 1 + self.span):
                 if j >= len(seg_list):
                     break
                 if seg_list[j].flag[0] not in self.tag or len(seg_list[j].word) < 2:
                     continue
                 if (seg_list[j].word, pair.word) in occu2num:
                     occu2num[(seg_list[j].word, pair.word)] += 1
                 else:
                     occu2num[(pair.word, seg_list[j].word)] += 1
     for key, value in occu2num.items():
         graph.add_edge(key[0], value, key[1])
     node_rank = graph.build_rank()
     node_rank = sorted(node_rank.items(), key=lambda x: x[1], reverse=True)
     return node_rank[:number]
def analyse_wordVector(model, name_list, sentence):
    for name in name_list:
        print('{}的词向量为:\n{}'.format(name, model[name]))
        print('与{}最相关的词:{}'.format(name, model.most_similar(name)))
        topn = 3  # 查看跟'令狐冲'相关性前三的词
        print('跟{}相关性前{}的词:\n{}'.format(name, topn,
                                        model.similar_by_word(name,
                                                              topn=topn)))
        print('跟{}关系相当于师妹跟林平之的关系的词:\n{}'.format(
            name, model.most_similar(['师妹', '林平之'], [name], topn=topn)))
        print('跟{}关系相当于师妹跟圣姑的关系的词:\n{}'.format(
            name, model.most_similar(['师妹', '圣姑'], [name], topn=topn)))
        #u"令狐冲 任盈盈 林平之 岳不群 东方不败"
    a, b = '令狐冲', '师妹'
    print('集合{}中不同类的词语:{}'.format(
        name_list,
        model.wv.doesnt_match(u"令狐冲 任盈盈 林平之 岳不群 东方不败".split())))  # 选出集合中不同类的词语
    print('{}和{}之间的相关度:{}'.format(a, b, model.wv.similarity(a,
                                                            b)))  # 两个词语之间的相关度

    #分词后对词的属性进行分析
    sentence = poss.lcut(sentence)
    # cut()分词,返回一个生成器generator,可通过迭代的方法访问各个分词
    # lcut()返回的是list,list(jieba.cut())等价与jieba.lcut()
    print(sentence)  # nr:人名 r:代词 v:动词
    print('测试句子中的人名有:',
          [list(i)[0] for i in sentence if list(i)[1] == 'nr'])  # ['林平之']
Example #31
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords,
                                 encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words

    result = ''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword) > 0 and flag in [
                u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg', u'a',
                u'an', u'ag', u'al', u'm', u'mq', u'o', u'x'
        ]):
            # and flag[0] in [u'n', u'f', u'a', u'z']):
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #去停用词和其他词性,比如非名词动词等
            result += tempword  # +"/"+str(w.flag)+" "  #去停用词
            return_words.append(tempword)
    return result, return_words
Example #32
def cut_word(raw_data):
    cut_result = []
    prbl_cl = list(raw_data)
    for entry in prbl_cl:
        word_cut = psg.lcut(entry)
        cut_result += word_cut
    return cut_result
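
# A usage sketch for cut_word; it returns one flat list of jieba pair objects
# (word plus flag) across all input strings. The inputs are illustrative.
pairs = cut_word(['今天天气不错', '我们去爬山'])
print([(p.word, p.flag) for p in pairs])
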
    def cut_sentence(self,
                     sentence,
                     by_word=True,
                     use_stop=True,
                     with_sg=False):
        """
        with_sg : whether to include POS flags
        """
        assert by_word != True or with_sg != True, '单个字分词没有词性'
        if by_word is True:
            return self._cut_by_word(sentence)

        else:
            jb_content = psg.lcut(sentence)

            if use_stop is True:
                # 判断是否存在停用词
                jb_content = [
                    i for i in jb_content if i.word not in self.stop_word
                ]
            if with_sg is True:
                jb_content = [(i.word, i.flag) for i in jb_content]

            else:
                jb_content = [i.word for i in jb_content]

        return jb_content
Example #34
def string2json(text, save_path=None):
    # input: text is a string
    # output: sample is a json, save sample to a json file and return True
    assert isinstance(save_path, str)
    # generate sentence and eligible entities
    lst_text = pseg.lcut(text)
    sentence = list()
    lst_entity = list()
    for i, j in lst_text:
        if ('n' in j) or (j in ['i', 'j', 's', 'l']):  # 名词,成语、习语、空间词、临时语,也包含未知词"un"
            lst_entity.append(i)
        sentence.append(i)
    sentence = " ".join(sentence)
    lst_entity = list(set(lst_entity))
    # generate sample with json structure
    sample = list()
    for head, tail in itertools.combinations(lst_entity, 2):  # 候选词两两组合
        d = {
            "sentence": sentence,
            "head": {"word": str(head), "id": str(head)},
            "tail": {"word": str(tail), "id": str(tail)},
            "relation": ""}
        sample.append(d)
        # 对称
        d = {
            "sentence": sentence,
            "head": {"word": str(tail), "id": str(tail)},
            "tail": {"word": str(head), "id": str(head)},
            "relation": ""}
        sample.append(d)
    # save sample
    with open(save_path, "w") as f:
        json.dump(sample, f)
    return True
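
# A usage sketch for string2json; the sentence and the save path are hypothetical.
# Every unordered entity pair is written in both directions, so n extracted entities
# yield n * (n - 1) candidate relations.
string2json('乔布斯创立了苹果公司', save_path='candidates.json')
with open('candidates.json') as f:
    print(len(json.load(f)))   # n * (n - 1) head/tail candidates for n entities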
Example #35
 def abstract_question(self, question):
     """
     使用jieba进行分词,将关键词进行词性抽象
     :param question:
     :RETURN:
     """
     self.abstractMap = {}
     list_word = pseg.lcut(question)  # 中文分词
     abstractQuery = ''
     nr_count = 0
     for item in list_word:
         word = item.word
         pos = str(item)
         if 'nm' in pos:  # 电影名
             abstractQuery += "nm "
             self.abstractMap['nm'] = word
         elif 'nr' in pos and nr_count == 0:
             abstractQuery += 'nnt '
             self.abstractMap['nnt'] = word
             nr_count += 1
         elif 'nr' in pos and nr_count == 1:  # nr再一次出现,改成nnr
             abstractQuery += "nnr "
             self.abstractMap['nnr'] = word
             nr_count += 1
         elif 'x' in pos:
             abstractQuery += "x "
             self.abstractMap['x'] = word
         else:
             abstractQuery += word + " "
     return abstractQuery
Example #36
def get_question_by_rowQ1(rowQ):
    # print(rowQ)
    # print(rowQ)
    line = p.sub("",rowQ[-1]).replace("\n","")
    # line = rowQ[-2] + p.sub("",line).replace("\n","")
    pos_list = posseg.lcut(line)
    question = ""
    if(rule_1(line,pos_list,rowQ) != None):
        question = rule_1(line,pos_list,rowQ)
    elif(rule_2(line,pos_list,rowQ) != None):
        question = rule_2(line,pos_list,rowQ)
    else:
        # question = line
        # print(rowQ)
        # print("x")
        question = rule_3(line,pos_list,rowQ)

    question = question.lower()
    if(question.find(rowQ[0].lower()) == -1):
        question = rowQ[0].lower()+" "+question

    # print(question)
    # print("~~~"*30)


    return question
Example #37
def pseg_adj_n(row):
    pair_pseg = pseg.lcut(row[3])  # lcut already returns a list

    adj = [word for word, flag in pair_pseg if flag[0] == "a"]
    n = [word for word, flag in pair_pseg if flag[0] == "n"]

    dataset = [adj, n]
    return dataset
def delstopwords(content):
    result = ''
    words = pseg.lcut("".join(content.split()))
    for word, flag in words:
        if word not in stopwords and flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz",
                                                  "/y"]:  # 去停用词和其他词性,比如非名词动词等
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
    return result
Example #39
def readfile(filename):
    jieba.load_userdict('../dict.txt') # 导入自定义词典  tips:更改词频
    file = open(filename)
    content = file.readline()
    parts = content.split(' ')
    res = pseg.lcut(parts[2]) # 返回列表形式
    # for i in range(1, len(res)):
    #     print (res[i]).word
    for word, flag in res:
        if flag == 'nr':
            print(word)
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """

    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]

    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    else:
        return res
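
# A usage sketch for cut_with_flag; filter_invalid_word is defined elsewhere in the
# source module, so the call below disables that filter. The sentence is illustrative.
print(cut_with_flag('今天天气不错', filter_invalid_word_flag=False))
# e.g. [('今天', 't'), ('天气', 'n'), ('不错', 'a')]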
Example #41
def ranking_function(output_prob_tree, cx, cy):
    # 平仄
    x_py = pypinyin.pinyin(cx, style=pypinyin.TONE2)
    y_py = pypinyin.pinyin(cy, style=pypinyin.TONE2)
    x_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, x_py)
    y_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, y_py)
    pingze_score = sum(map(lambda i, j: i + j == 0, x_pz, y_pz)) / float(len(cx)) + 0.001

    def sigmoid(x):
        return 1 / (1 + math.e ** (-x))

    def pos_eq(x_pos, y_pos):
        return x_pos == y_pos or x_pos in y_pos or y_pos in x_pos

    import operator
    smooth_value = 0.001
    freq_amp = 10 ** math.sqrt(len(cx))

    # 词性
    cx_pos = map(lambda x: zip(*pseg.lcut(x)[0])[0][1], cx)
    cy_pos = map(lambda y: zip(*pseg.lcut(y)[0])[0][1], cy)
    pos_score = reduce(operator.add, map(lambda x, y: float(1)/len(cx) if pos_eq(x, y) else 0, cx_pos, cy_pos))
    pos_score += smooth_value

    # 输出概率
    out_score = reduce(operator.mul, map(lambda x, y: output_prob_tree[x][y] * freq_amp, cx, cy))
    out_score = sigmoid(out_score)
    out_score += smooth_value

    # 整合
    score = pingze_score * out_score * pos_score
    # score = pingze_score * pos_score

    # print 'ranking', cy
    # print 'pingze', pingze_score
    # print 'pos', pos_score
    # print 'freq', out_score

    return score
def delstopwords(content):
    # words = jieba.lcut(content)
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词

    words = pseg.lcut(content)
    for word, flag in words:
        if (word not in stopwords and flag not in ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #去停用词和其他词性,比如非名词动词等
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
            # print result
    return result
Example #43
    def on_post(self, req, resp):
        body = req.stream.read()
        if not body:
            raise falcon.HTTPBadRequest("Empty request body")

        # seg_list = list(jieba.cut(body, cut_all=False))
        words = pseg.lcut(body)
        result = list()
        for word, flag in words:
            tmp = posSeg.posSeg(word, flag)
            result.append(tmp.__dict__)
        resp.body = json.dumps(result)
        resp.status = falcon.HTTP_200
    def sentence_to_vector(sentence, padding=True, padding_length=10):
        '''
        将句子的每个词转换成词向量,
        如果出现word2vec 的OOV词,则随机初始化;
        如果padding 为True的话,会将句子进行补全,不够长度则补0,超出长度则截断
        :param sentence: input sentence
        :type sentence: list
        :param padding: bool
        :param padding_length:
        :type padding_length: int
        :return:
        :rtype:np.array
        '''
        vectors = []
        for item in sentence:
            # print item
            try:
                vector = word2vec_model[unicode(item)]
            except:
                logging.debug(u'出现未知词(%s),随机填充' % (item))
                vector = get_unkown_vector(50)
            # print vector
            vectors.append(vector)

        if padding:
            if len(vectors) > padding_length:
                logging.debug(u'对句子进行截断:%s' % (' '.join(sentence)))
                seg_index = range(len(vectors))
                # print seg_index
                # 对代词进行过滤
                counter = 0
                for item in posseg.lcut(' '.join(sentence)):
                    # print counter

                    if item.flag == 'x':
                        # seg_index.remove(index)
                        continue
                    if item.flag == 'r':
                        logging.debug('去除:%s' % (item))
                        seg_index.remove(counter)
                    if len(seg_index) == padding_length:
                        break
                    counter += 1
                vectors = np.asarray(vectors)[seg_index][:padding_length]

                sentence = np.asarray(sentence)[seg_index][:padding_length]
                logging.debug(u'对句子进行截断后:%s' % (' '.join(sentence)))
            elif len(vectors) < padding_length:
                vectors.extend([get_None_vector(50)] * (padding_length - len(vectors)))

        return np.asarray(vectors)
Example #45
def delNOTNeedWords(content,stopwords):
    # words = jieba.lcut(content)
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词

    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and flag[0] in [u'n',u'f',u'a',u'z']): #去停用词和其他词性,比如非名词动词等
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
    return result
Example #46
async def _(session: NLPSession):
    # 去掉消息首尾的空白符
    stripped_msg_text = session.msg_text.strip()
    # 对消息进行分词和词性标注
    words = posseg.lcut(stripped_msg_text)

    city = None
    # 遍历 posseg.lcut 返回的列表
    for word in words:
        # 每个元素是一个 pair 对象,包含 word 和 flag 两个属性,分别表示词和词性
        if word.flag == 'ns':
            # ns 词性表示地名
            city = word.word

    # 返回处理结果,三个参数分别为置信度、命令名、命令会话的参数
    return NLPResult(90.0, 'weather', {'city': city})
Example #47
def delNOTNeedWords(content,stopwords):
    # words = jieba.lcut(content)
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词

    words = pseg.lcut(content)
    # jieba.cut()
    text_list = []
    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and flag not in ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #去停用词和其他词性,比如非名词动词等
            # text_list.append(word.encode('utf-8'))
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
        # ''.join(text_list)
    return result
Example #48
def seg(sentence,sep='|',full_mode = True,remove_stopword = False):
    '''
    使用jieba分词进行分词
    :param sentence: 待分词句子
    :type sentence: str
    :param remove_stopword: 是否去除stopword
    :type remove_stopword: bool
    :return:返回分词后字符串,seg_srt
    :rtype: str
    '''
    # logging.debug('是否去除stopwords:%s'%remove_stopword)
    # for items in jseg.lcut(sentence):
    #     print items.flag,items.word

    seg = []
    pattern = re.compile('[0-9]+$')
    for items in jseg.lcut(sentence):
        # 利用词性标注去除标点符号
        if items.flag == 'x':
            logging.debug(u'句子(%s)将标点符号:"%s"替换成""'%(sentence,items.word))
            seg.append('')
            # continue
        if remove_stopword and items.word in stopword_list:
            logging.debug(u'句子(%s)去除stopwords:%s' % (sentence,items))
            continue
        # 将数字替换成 NUM
        if pattern.match(items.word) and items.word not in exclude_word_list:
            print(items)
            seg.append('DIGITTAG')
            logging.debug(u'句子(%s)将数字:"%s" 替换成标记:"DIGITTAG"'%(sentence,items.word))
        else:
            seg.append(items.word)
    # sentence = [items.word for items in jseg.lcut(sentence) if items.flag!='x']


    sentence = ' '.join(seg)
    # print sentence
    # print sentence
    seg_list = jieba.lcut(sentence, cut_all=full_mode)
    # print seg_list
    seg_list = [item for item in seg_list if len(item.strip())!=0]
    # print seg_list
    seg_srt = sep.join(seg_list)
    return seg_srt
Example #49
def delstopwords(content):
    stopwords = codecs.open('stopwords.txt', encoding='UTF-8').read()
    stopwordSet = set(stopwords.split('\n'))
    # words = jieba.lcut(content)
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词

# Verb-related POS tags, for reference:
#   v    verb             vd   adverbial verb      vn   nominal verb
#   vshi the verb "是"     vyou the verb "有"       vf   directional verb
#   vx   formal verb      vi   intransitive verb   vl   verbal idiom
#   vg   verbal morpheme

    words = pseg.lcut(content)
    for word, flag in words:
        if (word not in stopwordSet and flag not in ["/x","/zg","/uj","/ul","/e","/d","/uz","/y","/v","/vd","/vn","/vshi","/vyou","/v","/vf","/vx","/vi","/vl","/vg"]): #去停用词和其他词性,比如非名词动词等
        # if (word not in stopwords and flag in ["/n","/a","/d"]): #去停用词和其他词性,比如非名词动词等

            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  #去停用词
            # print result
    return result
Example #50
    def parse_by_rules(self, text):
        self.words = pseg.lcut(parse_cn_number(text), HMM=False)
        while self.has_next():
            beginning = self.get_index()

            self.consume_repeat()

            self.consume_year_period() \
                or self.consume_month_period() \
                or self.consume_day_period()

            self.consume_weekday_period() \
                or self.consume_hour_period() \
                or self.consume_minute_period() \
                or self.consume_second_period()

            self.consume_year() \
                or self.consume_month() \
                or self.consume_day()

            self.consume_hour()

            if self.get_index() != beginning:
                # Time found
                self.consume_word(u'准时')
                if self.consume_word(u'提醒'):
                    self.consume_word(u'我')
                if self.current_tag() == 'v' and self.peek_next_word() == u'我':
                    self.advance(2)
                self.consume_to_end()
                # Donot set event to None,since serializer will just skip None and we will have no chance to modify it
                remind = Remind(time=self.now, repeat=self.repeat, desc=text, event=self.do_what)
                remind.reschedule()
                return remind
            else:
                self.advance()
        return None
Example #51
    def seg(self,
            sentence,
            sep=' ',
            full_mode=False,
            remove_stopword=False,
            replace_number=False,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            HMM=False,
            ):
        """
            使用 jieba 分词进行分词

        :param sentence: 待分词句子
        :type sentence: str
        :param sep: 将句子分完词之后使用什么字符连接,默认以空格连接.
        :type sep: str
        :param full_mode: jieba设置选项,是否使用full mode分词模式.
        :type full_mode: bool
        :param remove_stopword: 是否去除 stop word
        :type remove_stopword: bool
        :param replace_number: 是否把数字统一替换成字符 NUM
        :type replace_number: bool
        :param lowercase: 是否把字母转成小写
        :type lowercase: bool
        :param zhs2zht: 出現繁体的時候,是否转简体
        :type zhs2zht: bool
        :param remove_url: 是否移除 微博url,包含t.cn的url,比如:http://t.cn/开头的地址或者//t.cn/R50TdMg
        :type remove_url: bool
        :param HMM: 是否启用HMM发现新词模式,默认为False
        :type HMM: bool
        :return: 返回分词后字符串,seg_srt
        :rtype: str

        """
        # 先去除所有空格
        sentence = sentence.replace(' ', '')

        if lowercase:
            # 转成小写
            sentence = sentence.lower()
        if zhs2zht:
            # 繁体转简体
            sentence = self.convert_to_simple_chinese(sentence)
        if remove_url:
            # sentence = re.sub(u'(http:)//t.cn/[a-zA-Z0-9]*$', '', sentence)
            sentence = re.sub(u'(http:|)//t.cn/[a-zA-Z0-9]+', '', sentence)

        # 数字对模式匹配
        num_pattern = re.compile('[0-9][0-9\.]*$')
        words = []
        for item in jieba.lcut(sentence, HMM=False):
            if num_pattern.match(item):
                # 匹配上数字
                if not replace_number:
                    words.append(item)
                elif item not in self.exclude_word_list:
                    word = num_pattern.sub('NUMBER', item)
                    words.append(word)
                    if self.verbose > 1:
                        logging.debug(u'句子(%s)将数字:"%s" 替换成标记:"NUMBER"' % (sentence, item))
                        print(u'句子(%s)将数字:"%s" 替换成标记:"NUMBER"' % (sentence, item))
                else:
                    words.append(item)

            elif remove_stopword and item in self.stopword_list:
                # 移除 stop words
                if self.verbose > 1:
                    logging.debug(u'句子(%s)去除stopwords:%s' % (sentence, item))
            else:
                # 其他词如果词性是 x, 则识别到标点符号
                is_x = False
                for word, pos in jseg.lcut(item, HMM=HMM):
                    # print word,pos
                    if pos in ['x']:
                        is_x = True
                        # words.append(word)

                if is_x:
                    # 标点符号
                    # print item
                    if self.verbose > 1:
                        logging.debug(u'句子(%s)将标点符号:"%s"替换成""' % (sentence, ''))
                else:
                    words.append(item)

        sentence = ' '.join(words)
        # print sentence
        # print sentence
        seg_list = jieba.lcut(sentence, cut_all=full_mode)
        # print seg_list
        seg_list = [item for item in seg_list if len(item.strip()) != 0]
        # print seg_list
        seg_srt = sep.join(seg_list)

        return seg_srt
Example #52
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-"*40)

# quit()
jieba.add_word('石墨烯')
seg_list = jieba.cut(p, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut(p, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut(p)  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search(p)  # 搜索引擎模式
print(", ".join(seg_list))
print(jieba.suggest_freq(('好', '我')))
print(jieba.suggest_freq(('走', '了')))


print(','.join(jieba.lcut(p)))
print(','.join(jieba.lcut_for_search(p)))

print(','.join(['%s/%s' % (i, j) for i, j in pseg.lcut(p)]))
Example #53
def extract(input_string):
    sentences = input_string.split(" ")
    seg_list = [item for sentence in sentences for item in pseg.lcut(sentence)]
    for word, flag in seg_list:
        print(word, flag)
    return [[word, flag] for word, flag in seg_list]
    def __analyse_clause(self, the_clause, runout_filepath, print_show):
        sub_clause = {"score": 0, "positive": [], "negative": [], "conjunction": [], "punctuation": [], "pattern": []}
        seg_result = posseg.lcut(the_clause)

        # 将分句及分词结果写进运行输出文件,以便复查
        if runout_filepath is not None:
            self.__write_runout_file(runout_filepath, the_clause + '\n')
            self.__write_runout_file(runout_filepath, str(seg_result) + '\n')
        if print_show:
            print(the_clause)
            print(seg_result)

        # 判断句式:如果……就好了
        judgement = self.__is_clause_pattern2(the_clause)
        if judgement != "":
            sub_clause["pattern"].append(judgement)
            sub_clause["score"] -= judgement["value"]
            return sub_clause

        # 判断句式:是…不是…
        judgement = self.__is_clause_pattern1(the_clause)
        if judgement != "":
            sub_clause["pattern"].append(judgement)
            sub_clause["score"] -= judgement["value"]

        # 判断句式:短语
        judgement = self.__is_clause_pattern3(the_clause, seg_result)
        if judgement != "":
            sub_clause["score"] += judgement["score"]
            if judgement["score"] >= 0:
                sub_clause["positive"].append(judgement)
            elif judgement["score"] < 0:
                sub_clause["negative"].append(judgement)
            match_result = judgement["key"].split(":")[-1]
            i = 0
            while i < len(seg_result):
                if seg_result[i].word in match_result:
                    if i + 1 == len(seg_result) or seg_result[i + 1].word in match_result:
                        del (seg_result[i])
                        continue
                i += 1

        # 逐个分析分词
        for i in range(len(seg_result)):
            mark, result = self.__analyse_word(seg_result[i].word, seg_result, i)
            if mark == 0:
                continue
            elif mark == 1:
                sub_clause["conjunction"].append(result)
            elif mark == 2:
                sub_clause["punctuation"].append(result)
            elif mark == 3:
                sub_clause["positive"].append(result)
                sub_clause["score"] += result["score"]
            elif mark == 4:
                sub_clause["negative"].append(result)
                sub_clause["score"] -= result["score"]

        # 综合连词的情感值
        for a_conjunction in sub_clause["conjunction"]:
            sub_clause["score"] *= a_conjunction["value"]

        # 综合标点符号的情感值
        for a_punctuation in sub_clause["punctuation"]:
            sub_clause["score"] *= a_punctuation["value"]

        return sub_clause
Example #55
def pos_seg(word):
    # for items in jseg.lcut(word):
    #     print items.flag,items.word

    return jseg.lcut(word)[0].flag
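
# A usage sketch for pos_seg; it returns the POS flag of the first token jieba finds.
# The flags in the comments are typical values, not guaranteed.
print(pos_seg('今天'))   # e.g. 't'  (time word)
print(pos_seg('北京'))   # e.g. 'ns' (place name)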