Example #1
async def releaseFollower(bot, userQQ, userGroup, msg):
    # Check whether the user has any followers
    follow = await Utils3.getInformationOfUserSFollowerModule(userQQ)
    if follow['number'] == 0:
        return error
    # Check whether it conforms to the format
    if msg.find('放生') != -1:
        # Remove fixed fields
        msg = msg.replace('放生', '').strip()
        # Segment the original message with jieba
        listOfOriginalWordSegmentation = [msg]
        listOfOriginalWordSegmentation += jieba.lcut(msg)
        # Match follower names against the message segments
        for l in listOfOriginalWordSegmentation:
            for f in follow['follower_information']:
                # Exact match on the full follower name
                if f['name'] == l:
                    await distributionMessage.releaseFollowerDistributor(
                        bot, userQQ, userGroup, f['qq'])
                    return ok
                # Fall back to matching segmented follower names
                listOfFollowerNames = jieba.lcut(f['name'])
                for l_ in listOfFollowerNames:
                    if l_ == l:
                        await distributionMessage.releaseFollowerDistributor(
                            bot, userQQ, userGroup, f['qq'])
                        return ok
Example #2
    def save2label(self):
        custlist = []
        unique_query_list = []
        for q in self.cust_list:
            query_list = q.split(" ")
            # the tokenising step below may need review
            for query in query_list:
                if query.strip() != "" and query.strip(
                ) not in unique_query_list:
                    sent = query.strip()
                    tokenised_sent_list = []
                    # case where sentences are stuck together (contain links/images)
                    if self.hasLink(sent) or self.hasImage(sent):
                        imageless_list = self.tokeniser(
                            self.pattern_image, sent)
                        for msg in imageless_list:
                            if re.search(self.pattern_image, msg):
                                tokenised_sent_list.append(msg)
                            else:
                                linkless_list = self.tokeniser(
                                    self.pattern_link, msg)
                                for word in linkless_list:
                                    if not re.search(self.pattern_link, word):
                                        split_word = jieba.lcut(word)
                                        # keep decimal points in numbers
                                        # and fix the 教/交 (jiao) typo
                                        split_word_list = self.preprocess(
                                            split_word)

                                        for splitword in split_word_list:
                                            tokenised_sent_list.append(
                                                splitword)
                                    else:
                                        tokenised_sent_list.append(word)
                    else:
                        tokenised_sent_list = jieba.lcut(sent)
                        tokenised_sent_list = self.preprocess(
                            tokenised_sent_list)
                    # skip empty sentences
                    tokenised_sent = " ".join(tokenised_sent_list)
                    if tokenised_sent != "":
                        sent_intent = self.label_intent(sent)
                        sent_slots = self.label_slot(tokenised_sent_list)
                        labelled_sent = [
                            tokenised_sent, sent_intent, sent_slots
                        ]
                        custlist.append(labelled_sent)

                unique_query_list.append(query.strip())

        # save labelled cleaned data
        new_df = pd.DataFrame(data=custlist)
        new_df.to_csv(self.labelpath, index=False, encoding="utf-8")

        return custlist
Example #3
    def _segment(self, line, with_tag=True):
        self._reset()
        new_line = line
        for r, ta in default_regex:
            new_line = r.sub(self._sub_fn(ta, with_tag), new_line)
        matched = re.finditer(r'ph_[\s\S]+?_(?P<num>\d{1,2})_', new_line)
        start = 0
        tokens = []
        for m in matched:
            tokens += jieba.lcut(new_line[start: m.start()])
            tokens += [self.collector[int(m.group('num')) - 1]]
            start = m.end()
        tokens += jieba.lcut(new_line[start:])
        return tokens
Example #4
def convert_sentence_bio(sentence):

    word_list, label_list = [], []

    for c in jieba.lcut(sentence.strip('\n'), cut_all=False, HMM=False):
        if c not in ners.word.values:
            c_tag = 'O'
            word_tag = list(zip(list(c), [c_tag] * len(c)))
        else:
            c_ner = ners[ners['word'] == c]['ner'].values[0]
            if len(c) == 1:
                c_tag = ['I' + '-' + str(c_ner)]
            else:
                c_tag_head = 'B' + '-' + str(c_ner)
                c_tag_middle = ['I' + '-' + str(c_ner)] * (len(c) - 2)
                c_tag_tail = 'I' + '-' + str(c_ner)
                c_tag = [c_tag_head] + c_tag_middle + [c_tag_tail]
            word_tag = list(zip(list(c), c_tag))

        word_list += [i[0] for i in word_tag]
        label_list += [i[1] for i in word_tag]

    assert len(word_list) == len(label_list)

    record = list(zip(word_list, label_list))
    return record
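A minimal usage sketch for the converter above, assuming `ners` is a pandas DataFrame with `word` and `ner` columns, as the function expects. The entity row and the sentence below are invented for illustration, and the exact output depends on how jieba splits the sentence.

import jieba
import pandas as pd

# Hypothetical entity table; the column names follow what convert_sentence_bio reads.
ners = pd.DataFrame({'word': ['北京'], 'ner': ['LOC']})

print(convert_sentence_bio('我在北京工作'))
# expected, segmentation permitting:
# [('我', 'O'), ('在', 'O'), ('北', 'B-LOC'), ('京', 'I-LOC'), ('工', 'O'), ('作', 'O')]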
Example #5
def segment_line(line, char_level):
    line = CC.convert(REGEX.sub(' ', line))
    if char_level:
        segmented = tools.sen2chars(line)
    else:
        segmented = jieba.lcut(line)
    return list(filter(lambda x: x.strip(), segmented))
Example #6
def segment_sen(sen, char_level):
    sen = CC.convert(remove_pattern.sub(' ', sen))
    if char_level:
        segmented = tools.sen2chars(sen)
    else:
        segmented = jieba.lcut(sen)
    return list(filter(lambda x: x.strip(), segmented))
Example #7
    def threshold_value(self, sentence):
        """
        :param sentence: 待输入的新闻
        :return:列表各项分别代表积极/消极,公司名,公司股票代码,给出的概率
        """
        com_name = self.get_name(sentence)
        com_code = self.search_code(com_name)

        li = jieba.lcut(sentence, cut_all=True)

        p_threshold_value = 0
        n_threshold_value = 0

        for item in li:
            for key, value in self.positive_dict.items():
                if item == key:
                    p_threshold_value += value[4]
            for key, value in self.negative_dict.items():
                if item == key:
                    n_threshold_value += value[4]

        # print(com_name, com_code, p_threshold_value, n_threshold_value)

        if (p_threshold_value != 0 or n_threshold_value != 0) and com_name != ' ' and com_code != ' ' \
                and max(p_threshold_value, n_threshold_value) < 1.0:
            if p_threshold_value >= n_threshold_value:
                return ['positive', com_name, com_code, p_threshold_value]
            else:
                return ['negative', com_name, com_code, n_threshold_value]
        else:
            return []
Example #8
def cleanReviewChinese(content):
    """
    中文文本预处理函数:去除各种标点符号,HTML标签,小写化
    :param content:
    :return:
    """
    # 去除HTML标签
    beau = BeautifulSoup(content, features="lxml")
    btext = beau.get_text()

    # Remove punctuation, digits, letters and special characters
    sub_text = re.sub(
        '[a-zA-Z0-9'
        '1234567890'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'esngrikdchatvxyzolmpfu'
        '!;:.·、…【】《》“”‘’!?"#$%&%\'?@,。〔〕[]()()*+,-——\\./:;<=>+×/'
        '①↑↓★▌▲●℃[\\]^_`{|}~\s]+', "", btext)
    # Remove non-printable control characters
    newContent = re.sub(
        '[\001\002\003\004\005\006\007\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a]+',
        '', sub_text)
    # Segment the text into words with jieba
    words = jieba.lcut(newContent, cut_all=False)
    # Remove stop words
    with open("../data/stopwords.txt", encoding="utf-8-sig") as f:
        stopwords = [line.strip() for line in f.readlines()]
    filter_words = [word for word in words if word not in stopwords]
    res = " ".join(filter_words)
    return res
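A rough usage sketch for the cleaner above. It assumes beautifulsoup4 with lxml is installed and that the relative stop-word file `../data/stopwords.txt` exists; the HTML snippet is invented, and the exact output depends on jieba's split and the stop-word list.

import re
import jieba
from bs4 import BeautifulSoup

html = "<p>这部电影真的很好看!推荐大家去看。</p>"
print(cleanReviewChinese(html))
# e.g. "这部 电影 好看 推荐 大家 看" (punctuation and tags removed, tokens space-joined)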
Example #9
async def weather(session: CommandSession):
    ans = "NULL"
    # Get the city name from the session state (session.state); if missing, prompt the user
    city = lcut(session.get('city', prompt='亲亲您想查询哪个城市的天气呢?'))[-1]
    logger.debug(city)
    # Asynchronously fetch the weather forecast for the city
    city_weather = await get_weather_of_city(city)

    # Build the reply from the parsed result
    status = city_weather['status']
    if status == 'ok':
        basic = city_weather['basic']
        now = city_weather['now']
        # Decide how to phrase the location line
        # A bit messy, but kept this way so other configurations are easy to add later
        if basic['parent_city'] == basic['location']:
            if basic['admin_area'] == basic['parent_city']:
                ans = f"{basic['cnty']}{city}现在的天气状况:\n"
            else:
                ans = f"{basic['cnty']}{basic['admin_area']}{city}现在的天气状况:\n"
        else:
            ans = f"{basic['cnty']}{basic['admin_area']}省{basic['parent_city']}市{city}现在的天气状况:\n"

        ans += f"天气:{now['cond_txt']}\n" \
               f"此时实际气温:{now['tmp']}℃ \t体感气温:{now['fl']}℃\n" \
               f"能见度:{now['vis']}\t湿度:{now['hum']}\n" \
               f"风向:{now['wind_dir']}方向\t风力:{now['wind_sc']}级"
    elif status == 'no more requests':
        ans = '亲亲这边已经到查询上限了呢, 请明天再来呢'
    elif status == 'unknown location':
        ans = '您好这边找不到这个地方呢亲亲'
    elif status == 'error':
        ans = '亲亲请求失败呢'
    # Send the forecast to the user
    await session.send(ans)
Example #10
def test(filename):
    if FLAGS.src_word_seg == 'word':
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    #model.decoder_max_len = None

    #sources = ["你是誰","你是誰"]
    #targets = ["你是不是想人家","我是說你是我老婆"]
    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])
    scores = []
    for source, target in zip(sources, targets):
        if FLAGS.src_word_seg == 'word':
            source = (' ').join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = (' ').join([s for s in source])
        if FLAGS.trg_word_seg == 'word':
            target = (' ').join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = (' ').join([t for t in target])
        src_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(source),
                                                    src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(target),
                                                    trg_vocab_dict, False)
        trg_len = len(trg_token_ids)
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(src_token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)[:trg_len]
        output = [o[0][t] for t, o in zip(trg_token_ids, output)]
        output = np.mean(output)
        scores.append(output)
    scores = np.mean(scores)
    return scores
Example #11
def get_words(txt, kind="char", return_type="str", jieba=None):
    if kind == "word":
        result = jieba.lcut(txt)[:max_document_len]
        #result = list(filter(lambda x:len(x.strip())>0,result))
    elif kind == "char":
        result = [t for t in txt][:max_document_len]
    if return_type == "str":
        result = ' '.join(result)
    return result
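The `jieba` parameter above shadows any module-level import, so the caller has to pass the module (or a compatible tokenizer) when `kind="word"`. A small usage sketch, assuming it sits in the same file as the function and that the module-level `max_document_len` the function reads is defined:

import jieba

max_document_len = 100  # module-level limit assumed by get_words

print(get_words("我爱自然语言处理", kind="word", return_type="str", jieba=jieba))
# e.g. "我 爱 自然语言 处理"
print(get_words("我爱自然语言处理", kind="char", return_type="list"))
# ['我', '爱', '自', '然', '语', '言', '处', '理']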
Example #12
    def get_string_id(self, line):
        lst = jieba.lcut(clean(line))
        output = []
        for word in lst:
            if word in self.word2id:
                output.append(self.word2id[word])
            else:
                output.extend(self.get_string_id_fmm(word))
        return output
Example #13
async def quickSearchOfItemKeywords(msg, _model='sell_out'):
    p = './HolyGrailWar/Config/Goods/Goods.json'
    content = await Utils.readFileToJSON(p)
    if content == error:
        raise GrailExcept
    goodsList = content['goodslist']
    segmentationResult = jieba.lcut(msg)
    designatedPopUp = 0
    if _model == 'purchase' or _model == 'sell_out':
        # Extract the quantity to buy or sell
        numberMatchResults = await getTheNumberOfPossiblePurchases(
            segmentationResult)
        for i in segmentationResult:
            for g in goodsList:
                # Skip items whose 'eject' keywords appear in the message
                if 'eject' in g:
                    for i_ in segmentationResult:
                        if i_ in g['eject']:
                            designatedPopUp = 1
                            break
                if designatedPopUp == 1:
                    designatedPopUp = 0
                    continue
                if _model == 'purchase':
                    if 'notsale' in g:
                        if g['notsale'] == True:
                            break
                if g['name'] == str(i):
                    return [g['id'], int(numberMatchResults)]
                for a in g['abbreviation']:
                    if a == str(i):
                        return [g['id'], int(numberMatchResults)]
    elif _model == 'check_the_price':
        # Price checking mode
        for i in segmentationResult:
            for g in goodsList:
                # Skip items whose 'eject' keywords appear in the message
                if 'eject' in g:
                    for i_ in segmentationResult:
                        if i_ in g['eject']:
                            designatedPopUp = 1
                            break
                if designatedPopUp == 1:
                    designatedPopUp = 0
                    continue
                if 'notsale' in g:
                    if g['notsale'] == True:
                        break
                if g['name'] == str(i):
                    return [g['name'], g['price']]
                for a in g['abbreviation']:
                    if a == str(i):
                        return [g['name'], g['price']]
    return error
Example #14
async def _(session: NLPSession):
    # Strip leading/trailing whitespace from the message
    stripped_msg = session.msg_text.strip()
    logger.debug(stripped_msg)
    logger.debug(session.msg)
    # Segment the message into words
    words = lcut(stripped_msg)
    key_word = {'吗', '呢', '查询'}
    if any(map(lambda kw: kw in words, key_word)):
        return IntentCommand(90.0, ('zd'))
    return
Example #15
async def parse_location(location_word: Union[str, List[str]]) -> Location:
    """
    Parse location like "江苏省常州市武进区".

    :param location_word: location word (segmented or not)
    :return: Location object
    """
    if not location_word:
        return Location()

    if isinstance(location_word, str):
        location_words = jieba_fast.lcut(location_word)
    else:
        location_words = location_word

    logger.debug(f'Parsing location: {location_words}')

    location = Location()
    i = 0
    while i < len(location_words):
        if all((location.province, location.city, location.district)):
            # we are done with "省"、"市"、"区/县级市"
            break

        w = location_words[i].strip('省市区县')
        if not w:
            i += 1
            continue

        result = await heweather.find(w)
        if not result or result.get('status') != 'ok':
            i += 1
            continue

        # status is ok here, so there is at least one location entry
        basic = result.get('basic')[0]
        parsed = False
        if w == basic.get('admin_area'):
            location.province = w
            parsed = True
        if w == basic.get('parent_city'):
            # don't check parsed here, because we may encounter "北京",
            # of which city and province are the same
            location.city = w
            parsed = True
        if not parsed and w == basic.get('location'):
            location.district = w

        i += 1  # head on to the next

    location.other = ''.join(location_words[i:]) or None
    return location
Example #16
async def location_parse(word: Union[str, List[str]]) -> list:
    """
    构建用于api的地点字符串列表
    """
    if not word:
        return []
    if isinstance(word, str):
        loc = jieba_fast.lcut(word)
    else:
        loc = word

    location = []
    for i in loc:
        location.append(i.strip('省市区县'))
    return location
Example #17
def read_file_seg(filename):
    contents = []
    text, labels = read_file(filename)
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")
    k = 1
    for one in text:
        word = []
        blocks = re_han.split(one)
        for blk in blocks:
            if re_han.match(blk):
                word.extend(jieba.lcut(blk))
        contents.append(word)
        if k % 100 == 0:
            print(k)
        k = k + 1
    return contents, labels
Example #18
def LDA_topic(text):
    vector = []
    seg_list = jieba.lcut(text, cut_all=False)  # segment the text into a list of tokens
    result = []
    for j in seg_list:  # drop stop words and blanks
        if j not in stopwords and j != ' ':
            result.append(j)
    result = [result]
    dictionary = corpora.Dictionary(result)  # build a dictionary mapping each token to an id
    # use the dictionary to turn the corpus into a document-term (bag-of-words) matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in result]
    model = gensim.models.LdaModel.load('F:/代码code/DSTS/LDA/lda.model')  # load the trained LDA model
    for e, values in enumerate(model.inference(doc_term_matrix)[0]):
        for ee, value in enumerate(values):
            vector.append(value)
    return vector
Example #19
def get_threshold_value(sentence, positive_dict, negative_dict):
    li = jieba.lcut(sentence, cut_all=True)
    p_threshold_value = 0
    n_threshold_value = 0
    for item in li:
        for key, value in positive_dict.items():
            if item == key:
                p_threshold_value += value[4]
        for key, value in negative_dict.items():
            if item == key:
                n_threshold_value += value[4]
    if (p_threshold_value != 0 or n_threshold_value != 0) and max(
            p_threshold_value, n_threshold_value) < 1.0:
        if p_threshold_value >= n_threshold_value:
            return 'positive, ' + str(p_threshold_value)
        else:
            return 'negative, ' + str(n_threshold_value)
    else:
        return 'medium, 0'
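A toy call illustrating the data layout the function expects: each dictionary value is a sequence whose fifth element (`value[4]`) holds the sentiment weight. The words and weights below are invented.

import jieba

positive_dict = {'上涨': [0, 0, 0, 0, 0.6]}   # weight stored at index 4
negative_dict = {'下跌': [0, 0, 0, 0, 0.8]}

print(get_threshold_value('该公司股价大幅上涨', positive_dict, negative_dict))
# expected: 'positive, 0.6' (provided jieba's full-mode split contains '上涨')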
Example #20
def textSeg(sourefile0,targetfile0):
    targetfile = targetfile0 + '/sents.txt'
    f_w = open(targetfile, 'w+')
    k = 0
    for p in range(5):
        sourefile = sourefile0+'/part-0000'+str(p)
        f_r = open(sourefile,'r')
        for line in f_r:
            line = line.strip().lower()
            s = line.split('\t')
            if len(s)!=3:
                continue
            words = " ".join(jieba.lcut(s[2], HMM=True))
            f_w.write(words+'\n')
            k+=1
            if k%10000==0:
                print('write %d lines'%k)
        f_r.close()
        #os.remove(sourefile)
    f_w.close()
Example #21
    def get_output(self, input_text):

        # Build the API response payload
        return_dict = {
            'status': RET.OK,
            'response': 'success',
            'sentiment_res': {}
        }

        self.output["text"] = input_text
        self.jieba_words = jieba.lcut(input_text.replace(":", ""))  # jieba segmentation result
        print("jieba_words: ", self.jieba_words)
        self.split_texts = re.split('[,。?,.;?: ]+', input_text.strip())
        print("split_texts: ", self.split_texts)

        # Overall sentiment of the text
        emotion = 999
        for word in self.normal_judge_words:  # special case: force neutral sentiment
            if word in input_text:
                emotion = 0
        if emotion != 0:
            emotion = self.get_baidu_sentiment(input_text)  # overall score from the Baidu API
        self.output["res"]["emotion"] = emotion

        self.get_general_comment(input_text)  # general overall comment

        self.get_only_l1_comment()
        self.get_l1_l2_comment()
        self.get_11_l2_l3_comment()

        print(self.output)
        # self.output["res"]["matched_emotion"] = list(set(self.output["res"]["matched_emotion"]))
        return_dict["sentiment_res"] = {
            "text": self.output["text"],
            "seg_words": self.jieba_words,
            "res": self.output["res"]
        }

        return json.dumps(return_dict, ensure_ascii=False)
Example #22
    def cut_text(self, text):
        return jieba.lcut(text)
Example #23
def text_analysis(filename):
    with open(filename, 'r', encoding='UTF-8') as f:
        weibo_centent = f.read()
        weibo_dict = json.loads(weibo_centent)  # parse the JSON into Python objects
        # Each JSON file is one event; every dict in the resulting list is one related Weibo post
        start_time = (weibo_dict[0])['t']
        ternimal_time = (weibo_dict[-1])['t']
        N = 10
        interval = ((ternimal_time - start_time) / N)
        feature_vector = []
        x = []  # intermediate per-post feature vectors
        for i in range(0, len(weibo_dict)):
            item = weibo_dict[i]  # walk through the event's posts in order
            time = item['t']  # repost time
            stamp = int((time - start_time) / interval)
            if stamp == N:  # the endpoint value falls into the last interval
                stamp = (N - 1)  # time-slot index
            # ######################### content features of the post
            text = item['text']  # post text
            topic = LDA_topic(text)
            length = len(text)  # text length
            hashtags_list = re.findall("#(.*)#", text)  # extract hashtags
            hashtags = len(hashtags_list)  # number of hashtags
            emoticons = re.findall("\[(.*?)\]", text)  # extract emoticons
            neg_emot_count = 0
            pos_emot_count = 0
            for emot in emoticons:  # count sentiment emoticons
                if emot in neg_emot:
                    neg_emot_count += 1
                elif emot in pos_emot:
                    pos_emot_count += 1
            url = text.count('http://')  # number of URLs
            at_mention = text.count('@')  # number of @-mentions
            exclamation_marks = text.count('!') + text.count('!')  # exclamation-mark count
            question_marks = text.count('?') + text.count('?')  # question-mark count
            if exclamation_marks == 0:
                question_exclamation = 0
            else:
                question_exclamation = question_marks / exclamation_marks  # ratio
            seg_list = jieba.lcut(text, cut_all=False)  # segment the text into tokens
            pos_word_count = 0  # positive word count
            neg_word_count = 0  # negative word count
            first_person_pronouns = 0  # first-person pronouns are mostly stop words, so count them before filtering
            for word in seg_list:
                # use membership, not identity ('is'), for string comparison
                if word in ('我', '我们', '俺', '咱', '小生', '吾', '吾辈', '在下',
                            '老夫', '余', '鄙人', 'I', 'me', 'we', 'us', 'We'):
                    first_person_pronouns += 1
            result = []
            for k in seg_list:  # drop stop words
                if k not in stopwords:
                    result.append(k)
            for word in result:  # count sentiment words
                if word in neg_dict:
                    neg_word_count += 1
                elif word in pos_dict:
                    pos_word_count += 1
            sentiment_score = pos_emot_count + pos_word_count - neg_emot_count - neg_word_count  # sentiment score
            vector = [
                stamp, interval, length, sentiment_score, url, pos_emot_count,
                neg_emot_count, first_person_pronouns, hashtags, at_mention,
                question_marks, exclamation_marks, question_exclamation
            ] + topic

            x.append(vector)
        flag = 0  # current time-slot index
        temp = x[0]  # accumulator, initialised with the first post
        count = 1  # number of posts in the current slot
        for weibo in x[1:]:  # iterate from the second post onwards
            if weibo[0] == flag:  # same time slot
                temp[0] = flag
                temp[1] = interval
                for a in range(2, 31):
                    temp[a] += weibo[a]
                count += 1
            else:  # time slot changed
                for a in range(2, 31):
                    temp[a] = temp[a] / count
                feature_vector.append(temp)

                if weibo[0] != (flag + 1):  # non-contiguous slots: pad with zero vectors
                    for a in range(weibo[0] - flag - 1):
                        temp = [flag + 1] + [interval] + [0] * 11 + [1 / 18] * 18  # slot index, interval, 11 zeros and a uniform topic vector
                        flag += 1
                        if flag == (N - 1):  # never pad slot (N-1): the last slot always receives real data below
                            break
                        feature_vector.append(temp)
                if flag == (N - 1):  # once slot (N-1) is reached, all remaining posts are accumulated into it
                    # reset the time-slot marker and the accumulator
                    temp = [0] * len(weibo)
                    count = 0
                    # locate the current post
                    p = x.index(weibo)
                    for thing in x[p:]:
                        temp[0] = flag
                        temp[1] = interval
                        for a in range(2, 31):
                            temp[a] += thing[a]
                        count += 1
                    break
                flag = weibo[0]  # move to the new time slot and reset the accumulator
                temp = weibo
                count = 1
        for a in range(2, 31):  # finalise the last time slot of this event
            temp[a] = temp[a] / count
        feature_vector.append(temp)
    return feature_vector
Example #24
def word_segment(text):
    return jieba.lcut(text)
Example #25
    def get_keywords(self, stopkey, topk=5):
        """
        get the top k keywords
        :param topk:
        :return:
        """
        jieba_add_words()
        if isinstance(self.data, list):
            key_list = []
            for text in self.data:
                jieba.analyse.set_stop_words("data/stopWord.txt")
                keywords = jieba.analyse.textrank(text,
                                                  topK=topk,
                                                  allowPOS=('n', 'nz', 'v',
                                                            'vd', 'vn', 'l',
                                                            'a', 'd'))
                key_list.append(keywords)
            return pd.DataFrame({
                "id": self.idList,
                " text": self.data,
                "keywords": key_list
            })
        if isinstance(self.data, str):
            doc = ' '.join(jieba.lcut(preprocess(self.data, )))

            n_gram_range = (1, 1)
            count = CountVectorizer(ngram_range=n_gram_range,
                                    stop_words=stopkey).fit([doc])
            candidates = count.get_feature_names()

            model = SentenceTransformer(
                r'xlm-r-distilroberta-base-paraphrase-v1')

            doc_embedding = model.encode([doc])
            candidate_embeddings = model.encode(candidates)

            # top_n = 15
            # distances = cosine_similarity(doc_embedding, candidate_embeddings)
            # keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
            mss_kws = self.max_sum_sim(
                doc_embedding=doc_embedding,
                word_embeddings=candidate_embeddings,
                candidate_embeddings=candidate_embeddings,
                candidates=candidates,
                words=candidates,
                top_n=topk,
                nr_candidates=20)
            return mss_kws


# def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
#     """
#     get Max Sum Similarity
#     :param doc_embedding:
#     :param word_embeddings:
#     :param words:
#     :param top_n:
#     :param nr_candidates:
#     :return:
#     """
#     # Calculate distances and extract keywords
#     distances = cosine_similarity(doc_embedding, candidate_embeddings)
#     distances_candidates = cosine_similarity(candidate_embeddings,
#                                             candidate_embeddings)
#
#     # Get top_n words as candidates based on cosine similarity
#     words_idx = list(distances.argsort()[0][-nr_candidates:])
#     words_vals = [candidates[index] for index in words_idx]
#     distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]
#
#     # Calculate the combination of words that are the least similar to each other
#     min_sim = np.inf
#     candidate = None
#     for combination in itertools.combinations(range(len(words_idx)), top_n):
#         sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
#         if sim < min_sim:
#             candidate = combination
#             min_sim = sim
#
#     return [words_vals[idx] for idx in candidate]

# mss_kws = max_sum_sim(doc_embedding=doc_embedding,
#             word_embeddings=candidate_embeddings,
#             words=candidates,
#             top_n=20,
#             nr_candidates=20)

# def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
#     """
#      Maximal Marginal Relevance
#     :param doc_embedding:
#     :param word_embeddings:
#     :param words:
#     :param top_n:
#     :param diversity:
#     :return:
#     """
#     # Extract similarity within words, and between words and the document
#     word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
#     word_similarity = cosine_similarity(word_embeddings)
#
#     # Initialize candidates and already choose best keyword/keyphras
#     keywords_idx = [np.argmax(word_doc_similarity)]
#     candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
#
#     for _ in range(top_n - 1):
#         # Extract similarities within candidates and between candidates and selected keywords/phrases
#         candidate_similarities = word_doc_similarity[candidates_idx, :]
#         target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
#
#         # Calculate MMR
#         mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
#         mmr_idx = candidates_idx[np.argmax(mmr)]
#
#         # Update keywords & candidates
#         keywords_idx.append(mmr_idx)
#         candidates_idx.remove(mmr_idx)
#
#     return [words[idx] for idx in keywords_idx]

# mmr_kws = mmr(doc_embedding=doc_embedding,
#               word_embeddings=candidate_embeddings,
#               words=candidates,
#               top_n=20,
#               diversity=0.8)

# print(mss_kws)
# print(mmr_kws)
# print([s for s in mss_kws if s in mmr_kws])
Example #26
def segment_full_mode(txt_string):
    """ 全模式:
    我是上海交通大学学生
        => ['我', '是', '上海', '上海交通大学', '交通', '大学', '学学', '学生'] """
    return jieba.lcut(txt_string, cut_all=True)
Example #27
def segment_accurate_mode(txt_string):
    """ 精确模式:
    我是上海交通大学学生
        => ['我', '是', '上海交通大学', '学生'] """
    return jieba.lcut(txt_string)
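A small driver contrasting the two helpers above (Examples #26 and #27), assuming both are in scope. The expected outputs are the ones quoted in the docstrings, though the exact split can vary with the jieba version and dictionary.

import jieba

sentence = '我是上海交通大学学生'
print(segment_full_mode(sentence))      # full mode: all possible words, e.g. [..., '上海交通大学', '交通', '大学', ...]
print(segment_accurate_mode(sentence))  # accurate mode: e.g. ['我', '是', '上海交通大学', '学生']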
Example #28
def word_tokenize(text):
    text = pretreatment(text)
    return jieba.lcut(text)
Example #29
def _word_split(sentence):
    return jieba.lcut(sentence)
Example #30
def jieba_tokenize(sent):
    return jieba.lcut(sent)