Example 1
import jieba

async def thesaurusInitialization():
    # Dynamically load the thesaurus: register every goods name and
    # abbreviation from the config file as a jieba word.
    _p = './HolyGrailWar/Config/Goods/Goods.json'
    _content = await Utils.readFileToJSON(_p)  # project helper: read the file and parse it as JSON
    for _g in _content['goodslist']:
        jieba.add_word(_g['name'])
        for _a in _g['abbreviation']:
            jieba.add_word(_a)
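A minimal synchronous sketch of the same idea with the standard library, assuming the layout that the loop implies for Goods.json (a goodslist array whose items carry a name string and an abbreviation list); the layout is inferred from the loop above, not taken from the project:

import json

import jieba

def load_goods_thesaurus(path='./HolyGrailWar/Config/Goods/Goods.json'):
    # Assumed layout: {"goodslist": [{"name": "...", "abbreviation": ["...", ...]}, ...]}
    with open(path, encoding='utf-8') as f:
        content = json.load(f)
    for goods in content['goodslist']:
        jieba.add_word(goods['name'])       # register the full goods name
        for abbr in goods['abbreviation']:
            jieba.add_word(abbr)            # register each abbreviation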
Example 2
    def add_words(self, yanwenzi_dict_list, freq=100, tag='ywz'):
        '''
        Add new words to the segmentation dictionary.
        yanwenzi_dict_list may be a dict or a list; its entries are added to jieba's dictionary.
        '''
        if isinstance(yanwenzi_dict_list, dict):
            for k, v in yanwenzi_dict_list.items():
                jieba.add_word('_{}_'.format(v), freq=freq, tag=tag)
        elif isinstance(yanwenzi_dict_list, list):
            for word in yanwenzi_dict_list:
                jieba.add_word('_{}_'.format(word), freq=freq, tag=tag)
        else:
            raise Exception('yanwenzi_dict_list format must be dict or list.')
        print('jieba add words. length :', len(yanwenzi_dict_list))
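Why the entries are wrapped in underscores: once _label_ is registered with a custom tag, the placeholder that replaces an emoticon in the text should survive segmentation as a single token, and jieba.posseg should report it with that tag. A minimal sketch of the effect (the _cute_ placeholder and the sample sentence are illustrative, not from the original project):

import jieba
import jieba.posseg as pseg

# Register a placeholder the same way add_words() above does.
jieba.add_word('_cute_', freq=100, tag='ywz')

text = '今天心情_cute_真好'   # an emoticon in the raw text was replaced by its placeholder
print(jieba.lcut(text))       # the placeholder should come back as one '_cute_' token
print(pseg.lcut(text))        # and posseg should tag it with the custom 'ywz' tag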
Example 3
import sys
reload(sys)
sys.setdefaultencoding('utf8')
"""数据处理"""
'''
结巴分词模块
'''
import time
import jieba_fast
import jieba_fast.posseg as pseg

jieba_fast.load_userdict('/Users/zhuxinquan/Desktop/mykeyword.dict')
jieba_fast.add_word('烤鸭炉')
# Store the stop words (one per line) in a dict
fid2 = '/Users/zhuxinquan/Desktop/停用词调整_二手.txt'
stopword = {}
fid2 = open(fid2, 'r')
for j in fid2.readlines():
    stopword[j.strip().decode("utf-8")] = 1


def stop_word(line):
    data_line = line.strip()
    wordList = jieba_fast.cut(data_line)  # wordList is a generator
    outStr = ''
    t1 = time.time()
    for word in wordList:
        if word not in stopword:
            # keep words that are not in the stop list, separated by spaces
            outStr += word
            outStr += ' '
    return outStr
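A minimal usage sketch of stop_word() under the setup above (the sample sentence is illustrative; the dictionary and stop-word paths are the author's local files):

if __name__ == '__main__':
    sample = u'出售二手烤鸭炉,价格面议'   # illustrative sentence
    print(stop_word(sample))              # space-joined tokens that survived the stop list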
Example 4
import re

# Reconstructed fallback import: prefer the jieba_fast drop-in, else plain jieba.
try:
    import jieba_fast as jieba
except Exception as e:
    import jieba

# If model_dir is already defined (online run), keep it and use a smaller test split;
# otherwise fall back to the local defaults below.
try:
    print(model_dir)
    test_size = 0.025
    online = True
except:
    model_dir = "pai_model/"
    test_size = 0.05
    online = False

# Domain terms (Alipay/Ant Financial product names and related services) that
# should be kept as single tokens.
new_words = "支付宝 付款码 二维码 收钱码 转账 退款 退钱 余额宝 运费险 还钱 还款 花呗 借呗 蚂蚁花呗 蚂蚁借呗 蚂蚁森林 小黄车 飞猪 微客 宝卡 芝麻信用 亲密付 淘票票 饿了么 摩拜 滴滴 滴滴出行".split(
    " ")
for word in new_words:
    jieba.add_word(word)

star = re.compile(r"\*+")
if False:
    stops = [
        "、", "。", "〈", "〉", "《", "》", "一", "一切", "一则", "一方面", "一旦", "一来", "一样",
        "一般", "七", "万一", "三", "上下", "不仅", "不但", "不光", "不单", "不只", "不如", "不怕",
        "不惟", "不成", "不拘", "不比", "不然", "不特", "不独", "不管", "不论", "不过", "不问", "与",
        "与其", "与否", "与此同时", "且", "两者", "个", "临", "为", "为了", "为什么", "为何", "为着",
        "乃", "乃至", "么", "之", "之一", "之所以", "之类", "乌乎", "乎", "乘", "九", "也", "也好",
        "也罢", "了", "二", "于", "于是", "于是乎", "云云", "五", "人家", "什么", "什么样", "从",
        "从而", "他", "他人", "他们", "以", "以便", "以免", "以及", "以至", "以至于", "以致", "们",
        "任", "任何", "任凭", "似的", "但", "但是", "何", "何况", "何处", "何时", "作为", "你",
        "你们", "使得", "例如", "依", "依照", "俺", "俺们", "倘", "倘使", "倘或", "倘然", "倘若",
        "借", "假使", "假如", "假若", "像", "八", "六", "兮", "关于", "其", "其一", "其中", "其二",
        "其他", "其余", "其它", "其次", "具体地说", "具体说来", "再者", "再说", "冒", "冲", "况且",
Example 5
def add_jieba(wds, tag):
    # Register each word with a very high frequency so jieba will not split it.
    for wd in wds:
        jieba.add_word(wd, tag=tag, freq=3000000)
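A minimal sketch of what the very high frequency buys (the sample term is borrowed from Example 3; exact segmentations depend on the dictionary in use):

import jieba

print(jieba.lcut('我想买一台烤鸭炉'))            # the unregistered term may be split apart

# Same idea as add_jieba() above: a huge frequency keeps the word in one piece.
jieba.add_word('烤鸭炉', tag='nz', freq=3000000)
print(jieba.lcut('我想买一台烤鸭炉'))            # '烤鸭炉' should now come back as one token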
Example 6
# -*- coding: utf-8 -*-
"""
@Time : 2020/3/8 11:06
@Auth : joleo
@File :bm_recall.py
"""
# https://blog.csdn.net/byn12345/article/details/81112973
from gensim import corpora, models, similarities
from load_data import read_test, read_context, read_train
from collections import defaultdict
import jieba_fast as jieba
from recall.rank_bm25 import BM25Okapi
import pandas as pd

# jieba.load_userdict(file_name='')
# Register COVID-19 policy terms from the corpus so they are not split.
jieba.add_word('复工')    # "resume work"
jieba.add_word('稳岗')    # "job retention"
jieba.add_word('医保局')  # "healthcare security bureau"
jieba.add_word('暖企')    # "enterprise-support ('warm the enterprises') measures"
# jieba.del_word('医保局')

paper_data = read_context(
    'data/NCPPolicies_context_20200301/NCPPolicies_context_20200301.csv')
train_data = read_train(
    './data/NCPPolicies_train_20200301/NCPPolicies_train_20200301.csv')
valid_data = read_test('./data/NCPPolicies_test/NCPPolicies_test.csv')

train_data['question'] = train_data['question'].map(
    lambda x: x.replace('\n', ''))
train_data['answer'] = train_data['answer'].map(lambda x: x.replace('\n', ''))
valid_data['question'] = valid_data['question'].map(
    lambda x: x.replace('\n', ''))
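The BM25Okapi import above points at the retrieval step that follows; a minimal, self-contained sketch of that step with jieba-tokenized contexts (the toy corpus and query are illustrative, and the pip package rank_bm25 is assumed to behave like the vendored recall.rank_bm25):

import jieba_fast as jieba
from rank_bm25 import BM25Okapi

# Toy stand-in for the policy contexts loaded above.
corpus = [
    '各地出台稳岗补贴政策,支持企业有序复工',
    '医保局发布疫情期间医疗保障经办服务通知',
    '多项暖企措施帮助中小企业渡过难关',
]
tokenized_corpus = [jieba.lcut(doc) for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)
query = jieba.lcut('企业复工有哪些稳岗政策')
print(bm25.get_top_n(query, corpus, n=1))   # the best-matching context for the query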
Example 7
import numpy as np
import jieba
from gensim.models import Word2Vec

mode = 0
char_size = 128
maxlen = 256
min_count = 16

word2vec = Word2Vec.load('../word2vec_baike/word2vec_baike')

id2word = {i + 1: j for i, j in enumerate(word2vec.wv.index2word)}
word2id = {j: i for i, j in id2word.items()}
word2vec = word2vec.wv.syn0
word_size = word2vec.shape[1]
word2vec = np.concatenate([np.zeros((1, word_size)), word2vec])

# jieba.dt is the default Tokenizer; its FREQ dict holds the known vocabulary,
# so this registers every word2vec word that jieba does not already know.
for w in word2id:
    if w not in jieba.dt.FREQ:
        jieba.add_word(w)


def tokenize(s):
    # HMM=False disables new-word discovery, so segmentation sticks to the
    # dictionary (which now includes the word2vec vocabulary).
    return jieba.lcut(s, HMM=False)


def sent2vec(S):
    """S format: [[w1, w2], ...] (a batch of tokenized sentences)
    """
    V = []
    for s in S:
        V.append([])
        for w in s:
            for _ in w:
                # repeat the word id once per character so the word-level ids
                # line up with a character-level view of the same sentence
                V[-1].append(word2id.get(w, 0))
Example 8
import pandas as pd
import jieba
import jieba.posseg as jieba_posseg


def Autotag_gen(sentences_list):
    # existing_ner_df and jieba_dict_df are module-level tables that are
    # updated in place with newly found out-of-vocabulary (OOV) words.
    global existing_ner_df
    global jieba_dict_df

    OOVs_df = OOV_scan_run(sentences_list)
    #print(OOVs_df)
    #print('\n\n')
    #### Post-processing for OOV:

    ###  OOV post-processing 1: adding OOVs to existing OOV data:
    existing_ner_df = pd.concat([existing_ner_df, OOVs_df],
                                axis=0,
                                ignore_index=True)

    ###  OOV post-processing 2: adding OOVs to jieba data:
    for i in OOVs_df.itertuples():
        OOV = str(i.word)
        OOV_nertag = str(i.ner)

        OOV_postag = ner2pos[OOV_nertag]
        # suggest_freq(tune=False) only returns the frequency needed for the word
        # to be kept whole; add_word then registers the word with that frequency.
        jieba_freq = jieba.suggest_freq(OOV, tune=False)

        jieba.add_word(OOV, jieba_freq, tag=OOV_postag)
        # Append the new row (pd.concat; DataFrame.append is gone in pandas 2.x).
        jieba_dict_df = pd.concat([
            jieba_dict_df,
            pd.DataFrame([[OOV, jieba_freq, OOV_postag]],
                         columns=jieba_dict_df.columns)
        ])

    del OOVs_df
    #### Run Hash-tag generation

    # #### Hash-tag generation step 1: cleaning raw text + split into sentences
    # preprocessing_sentences =text_preprocessing(raw_sentences)
    # preprocessing_sentences_list= DummySentencizer(preprocessing_sentences, split_characters=['。」' ,'。' ,';' ,'!' ,'*']).sentences
    # preprocessing_sentences_list=[s for s in preprocessing_sentences_list if len(s)>1]

    #### Hash-tag generation step 2: tokenize the pre-processing sentences

    #### Hash-tag generation step 2.1 : apply jieba to tokenize the pre-processing sentences
    tokens = []
    for preprocessing_sentence in sentences_list:
        tokens += [
            (t, pos)
            for t, pos, in jieba_posseg.cut(preprocessing_sentence, HMM=False)
            if pos in ['nr', 'nt', 'ns', 'n', 'nz', 'x'] and len(t) > 1
        ]
        tokens = list(set(tokens))
        tokens = sorted(tokens, key=lambda x: len(x[0]), reverse=False)

    #### Hash-tag generation step 2.2 : clean tokens
    tokens_clean = []
    for index, (token, pos) in enumerate(tokens):
        # Keep a token only if it is not a substring of any longer token later
        # in the length-sorted list.
        sub_tokens = tokens[index + 1:]
        Sum_check = 0
        for sub_t, sub_pos in sub_tokens:
            if token in sub_t:
                Sum_check += 1

        if Sum_check == 0:
            tokens_clean.append((token, pos))
    del tokens
    # duplicate_check=pd.DataFrame(tokens_duplicate_check,columns=['token','pos','Sum_check'])
    # duplicate_check.sort_values('Sum_check',ascending=True,inplace=True)
    # duplicate_check.drop_duplicates('token',inplace=True)
    # tokens_clean= list(duplicate_check.to_records(index=False))

    tokens_clean = sorted(tokens_clean, key=lambda x: len(x[0]), reverse=True)

    #### Hash-tag generation step 3: locate hash-tags by NER classes
    raw_autotags = []
    for (token, postag) in tokens_clean:

        ##---- Hash-tag generation step 3.1 : Convert jieba POS tags into NER class
        if token in existing_ner_df['word'].values:

            ner_class = existing_ner_df[existing_ner_df['word'] ==
                                        token]['ner'].values[0]
            raw_autotags.append((token, ner_class))
            # print((token, ner_class))
        elif token in jieba_dict_df['word'].values:
            # print('cannot process: ',token,postag)
            try:
                ner_class = pos2ner[jieba_dict_df[jieba_dict_df['word'] ==
                                                  token]['pos'].values[0]]
                raw_autotags.append((token, ner_class))
            except:
                print(token, postag)
        #print((token, ner_class))
        # print(jieba_dict_df[jieba_dict_df['word']==token]['pos'].values[0])
        else:
            #print((token,postag))
            continue
        ##--- Hash-tag generation step 3.2 : filter of NER class
    del tokens_clean

    autotags = []
    for (token, ner_class) in raw_autotags:

        if len(token) >= 5 and ner_class not in ['J']:
            autotags.append((token, ner_class))
        elif len(token) > 2 and len(token) < 5 and ner_class not in [
                'TITLE', 'J'
        ]:
            #print(token,ner_class)
            autotags.append((token, ner_class))

        elif len(token) == 2 and ner_class not in [
                'PRODUCT', 'TITLE', 'J', 'TERM'
        ]:
            #print(token,ner_class)
            autotags.append((token, ner_class))

    del raw_autotags

    autotags_df = pd.DataFrame(autotags, columns=['hashtags', 'type'])
    autotags_df.drop_duplicates('hashtags', inplace=True)
    return autotags_df
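The cleaning rule in step 2.2 keeps only tokens that are not contained in any longer token; a minimal standalone sketch of that rule (the sample tokens are illustrative):

def drop_substring_tokens(tokens):
    # Sort shortest-first, then keep a (token, pos) pair only if the token is
    # not a substring of any token that comes later (i.e. any longer token).
    tokens = sorted(set(tokens), key=lambda x: len(x[0]))
    kept = []
    for i, (token, pos) in enumerate(tokens):
        if not any(token in longer for longer, _ in tokens[i + 1:]):
            kept.append((token, pos))
    return kept

print(drop_substring_tokens([('医保', 'n'), ('医保局', 'nt'), ('复工', 'n')]))
# expected: [('复工', 'n'), ('医保局', 'nt')] -- '医保' is dropped because '医保局' contains it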
Example 9
    def jiebaCustomSetting(self, dict_path, usr_dict_path):

        # Replace jieba's main dictionary, then register every line of the
        # user dictionary file as an extra word.
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, "r", encoding="utf-8") as dic:
            for word in dic:
                jieba.add_word(word.strip("\n"))
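When the user dictionary file follows jieba's own format (one entry per line: the word, optionally followed by a frequency and a part-of-speech tag), the per-line loop above can be replaced by jieba.load_userdict; a minimal sketch with placeholder paths:

import jieba

jieba.set_dictionary('dict.txt.big')   # placeholder: path to the main dictionary
# userdict.txt lines may be "word", "word freq", or "word freq tag".
jieba.load_userdict('userdict.txt')    # placeholder: path to the user dictionary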