Example #1
def peredata(content_data):
    data = content_data.tolist()
    train_label = []  # 1/0 labels: whether each sentence pair is similar
    word_index_dic = {}  # maps every word in the corpus to an integer id
    sentences1_seg_list = [[] for index in range(len(data))]  # tokenized first sentences
    sentences2_seg_list = [[] for index in range(len(data))]  # tokenized second sentences
    is_get_data_info = False  # flag: optionally inspect the raw data first
    if is_get_data_info:
        get_data_info(data)

    qinghuaSeg = pkuseg.pkuseg()
    for i in range(len(data)):
        # each row holds a single "sentence1,sentence2,label" string
        sen1 = data[i][0].split(',')[0]
        sen2 = data[i][0].split(',')[1]
        train_label.append(int(data[i][0].split(',')[2]))
        seg_content_data1 = qinghuaSeg.cut(sen1)
        seg_content_data2 = qinghuaSeg.cut(sen2)
        # store the tokenization results
        sentences1_seg_list[i] = seg_content_data1
        sentences2_seg_list[i] = seg_content_data2
        # assign an integer id to each unseen word and record it in word_index_dic
        for word1 in seg_content_data1:
            if word1 not in word_index_dic:
                word_index_dic[word1] = len(word_index_dic) + 1
        for word2 in seg_content_data2:
            if word2 not in word_index_dic:
                word_index_dic[word2] = len(word_index_dic) + 1
        if i % 1000 == 0:
            print('progress:', i / len(data))
    if not os.path.exists('word_index_dict.json'):
        with open('word_index_dict.json', 'w', encoding='utf-8') as f:
            json.dump(word_index_dic, f, ensure_ascii=False)
    # save label info
    if not os.path.exists("train_label.npy"):
        np.save("train_label.npy", np.array(train_label))
    print('Tokenization finished: %d distinct words; word-to-id mapping saved in word_index_dict.json' %
          len(word_index_dic))  # 6872
    print(len(train_label))
    print(train_label[0])

    sentences1_data = []
    sentences2_data = []
    for ssl1 in range(len(sentences1_seg_list)):
        word_vec_list1 = []
        for word in sentences1_seg_list[ssl1]:
            word_vec_list1.append(word_index_dic[word])
        sentences1_data.append(word_vec_list1)
    for ssl2 in range(len(sentences2_seg_list)):
        word_vec_list2 = []
        for word in sentences2_seg_list[ssl2]:
            word_vec_list2.append(word_index_dic[word])
        sentences2_data.append(word_vec_list2)
    if not os.path.exists("sentence_data1.npy"):
        np.save("train_data1.npy", sentences1_data)
    if not os.path.exists("sentence_data2.npy"):
        np.save("train_data2.npy", sentences2_data)
    return sentences1_data, sentences2_data, train_label
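The function above expects each row of content_data to wrap a single "sentence1,sentence2,label" string. A minimal driver sketch (the file name train.csv and its line format are assumptions, not part of the original code):

# Hypothetical usage sketch for peredata(); 'train.csv' is an assumed file whose
# lines each contain one "sentence1,sentence2,label" record.
import numpy as np

with open('train.csv', encoding='utf-8') as f:
    rows = [[line.strip()] for line in f if line.strip()]
sent1_ids, sent2_ids, labels = peredata(np.array(rows, dtype=object))
print(len(sent1_ids), len(labels))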
Example #2
 def word_cut(self):
     seg = pkuseg.pkuseg()
     word_set = set()
     for text in self.texts:
         words = seg.cut(text)
         for word in words:
             if word not in word_set:
                 word_set.add(word)
     self.word_set = word_set
Example #3
 def __init__(self, ner_dict_path, stopword_file_path):
     self.ner_dict_path = ner_dict_path
     self.stopwords = set()
     self.pku = pkuseg.pkuseg()
     with open(stopword_file_path, 'r', encoding='utf-8') as file:
         for line in file:
             self.stopwords.add(line.strip())
     # self.ner = ner()
     jieba.load_userdict(ner_dict_path)
Example #4
 def __cut_corpus(self):
     if self.seg == 'jieba' or self.seg is None:
         jieba.load_userdict(self.user_word_path)
     elif self.seg == 'pkuseg':
         self.myseg = pkuseg.pkuseg(user_dict=self.my_word_list)
     corpus_cut = []
     for s in self.corpus:
         corpus_cut.append(self.__cut_str(s))
     return corpus_cut
Example #5
def pere_test_data(content_data):
    data = content_data.tolist()
    test_label = []  # 1/0 labels: whether each sentence pair is similar
    word_index_dic = {}  # maps every word in the corpus to an integer id
    sentences1_seg_list = [[] for index in range(len(data))]  # tokenized first sentences
    sentences2_seg_list = [[] for index in range(len(data))]  # tokenized second sentences

    is_get_data_info = False  # flag: optionally inspect the raw data first
    if is_get_data_info:
        get_data_info(data)
    # for the test data, reuse the word_index_dict built from the training data
    with open('word_index_dict.json', 'r', encoding='utf-8') as f:
        word_index_dic = json.load(f)
    qinghuaSeg = pkuseg.pkuseg()
    for i in range(len(data)):
        sen1 = data[i][0].split(',')[0]
        sen2 = data[i][0].split(',')[1]
        test_label.append(int(data[i][0].split(',')[2]))
        seg_content_data1 = qinghuaSeg.cut(sen1)
        seg_content_data2 = qinghuaSeg.cut(sen2)
        # store the tokenization results
        sentences1_seg_list[i] = seg_content_data1
        sentences2_seg_list[i] = seg_content_data2
        if i % 1000 == 0:
            print('progress:', i / len(data))

    # save label info
    if not os.path.exists("test_label.npy"):
        np.save("test_label.npy", np.array(test_label))
    print(len(test_label))
    print(test_label[0])

    sentences1_data = []
    sentences2_data = []
    for ssl1 in range(len(sentences1_seg_list)):
        word_vec_list1 = []
        for word in sentences1_seg_list[ssl1]:
            if word in word_index_dic:
                word_vec_list1.append(word_index_dic[word])
            else:
                word_vec_list1.append(0)
        sentences1_data.append(word_vec_list1)
    for ssl2 in range(len(sentences2_seg_list)):
        word_vec_list2 = []
        for word in sentences2_seg_list[ssl2]:
            if word in word_index_dic:
                word_vec_list2.append(word_index_dic[word])
            else:
                word_vec_list2.append(0)
        sentences2_data.append(word_vec_list2)
    if not os.path.exists("test_data1.npy"):
        np.save("test_data1.npy", sentences1_data)
    if not os.path.exists("test_data2.npy"):
        np.save("test_data2.npy", sentences2_data)
    return sentences1_data, sentences2_data, test_label
Example #6
 def __init__(self, spo_files, pku_model_name='default', predicate=False):
     self.predicate = predicate
     self.spo_file_paths = [config.KGS.get(f, f) for f in spo_files]
     self.lookup_table = self._create_lookup_table()
     self.segment_vocab = list(
         self.lookup_table.keys()) + config.NEVER_SPLIT_TAG
     self.tokenizer = pkuseg.pkuseg(model_name=pku_model_name,
                                    postag=False,
                                    user_dict=self.segment_vocab)
     self.special_tags = set(config.NEVER_SPLIT_TAG)
Example #7
 def load_pkuseg_model(path):
     try:
         import pkuseg
     except ImportError:
         if self.use_pkuseg:
             raise ImportError(
                 "pkuseg not installed. To use this model, " +
                 _PKUSEG_INSTALL_MSG)
     if path.exists():
         self.pkuseg_seg = pkuseg.pkuseg(path)
Example #8
 def __init__(
         self,
         opList,  # list of [original name, possible name] pairs
         user_dict=None):  # optional user-defined dictionary
     if user_dict is not None:
         self.seg = pkuseg.pkuseg(user_dict=user_dict, postag=False)
     else:
         self.seg = pkuseg.pkuseg(postag=False)
     self.instructions = []
     corpus = []
     for row in opList:
         question = row[0]
         corpusRow = []
         for word in self.seg.cut(question):
             corpusRow.append(word)
         corpus.append(corpusRow)
         self.instructions.append(question)
     self.bm25Model = bm25.BM25(corpus)
     self.corpus = corpus
Example #9
def zh_word_seg_by_pku(list_of_sentences, user_dict=[]):
    """
    Tokenize Chinese words by pkuseg
    :params
        list_of_sentences (list): [ sentence_a (str), sentence_b (str), ... ]
        user_dict (list): customized dictionary, e.g., [ '你好', '朋友', ... ]
    """
    user_dict = user_dict if user_dict else 'default'
    seg = pkuseg.pkuseg(user_dict=user_dict)  # keyword arg: the first positional parameter is model_name
    return list(map(lambda x: seg.cut(x), list_of_sentences))
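A short hypothetical call, with and without a custom word list (pkuseg accepts a list for user_dict, as Examples #4 and #6 also show); the sentences are illustrative only:

# Hypothetical usage of zh_word_seg_by_pku
sentences = ['今天天气不错', '我们一起去公园散步']
print(zh_word_seg_by_pku(sentences))                        # built-in default dictionary
print(zh_word_seg_by_pku(sentences, user_dict=['去公园']))  # custom word list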
Example #10
    def _prepare(self):
        # POS preprocessing
        # POS tag reference: https://github.com/lancopku/pkuseg-python/blob/master/tags.txt
        # jio.util.pkuseg_postag_loader()
        self.pos_name = set(pkuseg_postag_loader().keys())
        self.pos_exception = set(['u', 'p', 'c', 'y', 'e', 'o', 'w'])
        self.loose_pos_name = self.pos_name - self.pos_exception
        self.strict_pos_name = [
            'a', 'n', 'j', 'nr', 'ns', 'nt', 'nx', 'nz', 'ad', 'an', 'vn',
            'vd', 'vx'
        ]

        # rules for discarding redundant phrases
        self.redundent_strict_pattern = re.compile(
            r'[\*\|`\;:丨\-\<\>]')  # discard if any of these characters appears
        self.redundent_loose_pattern = re.compile(
            r'[/\d\.\-:=a-z+,%]+')  # discard only if the phrase consists entirely of these characters
        self.extra_date_ptn = re.compile(r'\d{1,2}[月日]')

        # load idf values; the median idf is used as the fallback for OOV words
        self.idf_dict = idf_loader()
        self.median_idf = sorted(self.idf_dict.values())[len(self.idf_dict) //
                                                         2]
        self.seg = pkuseg.pkuseg(postag=True)  # PKU segmenter with POS tagging

        # phrase-length weight dict: keeps most extracted phrases between 2 and 6 words.
        # Manually extracted key phrases follow one length distribution, while the idf/LDA-based
        # scores yield another; these weights nudge the extracted distribution toward the manual one.
        self.phrases_length_control_dict = {
            1: 1,
            2: 5.6,
            3: 1.1,
            4: 2.0,
            5: 0.7,
            6: 0.9,
            7: 0.48,
            8: 0.43,
            9: 0.24,
            10: 0.15,
            11: 0.07,
            12: 0.05
        }
        self.phrases_length_control_none = 0.01  # weight used for phrase lengths beyond the table above

        # weights for phrase POS-tag combinations
        with open(os.path.join(DIR_PATH, 'pos_combine_weights.json'),
                  'r',
                  encoding='utf8') as f:
            self.pos_combine_weights_dict = json.load(f)

        # load the stop-word list
        self.stop_words = stopwords_loader()

        # load the LDA model parameters
        self._lda_prob_matrix()
Example #11
    def __init__(self,
                 pretrain_path,
                 max_length,
                 hidden_size,
                 att_dim,
                 is_zh=False):
        super(AttBiLSTMEncoder, self).__init__()
        self.is_zh = is_zh
        if not self.is_zh:
            # English
            self.token2idx = json.load(
                open(
                    os.path.join(pretrain_path, "att-bi-lstm",
                                 "token2idx.json"), "r"))
            word_vec = torch.from_numpy(
                np.load(
                    os.path.join(pretrain_path, "att-bi-lstm",
                                 "word_vec.npy")))
        else:
            # Chinese
            self.token2idx = json.load(
                open(
                    os.path.join(pretrain_path, "att-bi-lstm-zh",
                                 "token2idx.json"), "r"))
            word_vec = torch.from_numpy(
                np.load(
                    os.path.join(pretrain_path, "att-bi-lstm-zh",
                                 "word_vec.npy")))
            import pkuseg
            self.seg = pkuseg.pkuseg()
        self.word_count, self.word_vec_dim = word_vec.shape

        # Unknown, Blank
        self.unk_idx, self.blk_idx = self.word_count, self.word_count + 1
        unk = torch.randn(1, self.word_vec_dim,
                          dtype=torch.double) / math.sqrt(self.word_vec_dim)
        blk = torch.zeros(1, self.word_vec_dim, dtype=torch.double)

        # Embedding layer
        self.word_embedding = nn.Embedding(self.word_count + 2,
                                           self.word_vec_dim,
                                           padding_idx=self.blk_idx)
        self.word_embedding.weight.data.copy_(
            torch.cat((word_vec, unk, blk), 0))

        # Self-Att Bi-LSTM layer
        self.bilstm = nn.LSTM(self.word_vec_dim,
                              hidden_size,
                              batch_first=True,
                              bidirectional=True)
        self.att1 = nn.Linear(2 * hidden_size, att_dim, bias=False)
        self.att2 = nn.Linear(att_dim, 1, bias=False)
        self.max_length = max_length
Example #12
def load_file(path):
    en = []
    cn = []
    seg = pkuseg.pkuseg()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            cn.append(["BOS"] + seg.cut(line[1]) + ["EOS"])

    return en, cn
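A hypothetical call; the path and the file layout (tab-separated English/Chinese sentence pairs, one pair per line) are assumptions:

# Hypothetical usage of load_file(); 'data/train.txt' is an assumed path whose lines
# look like: "I love NLP.\t我爱自然语言处理。"
en_sents, cn_sents = load_file('data/train.txt')
print(en_sents[0])   # e.g. ['BOS', 'i', 'love', 'nlp', '.', 'EOS']
print(cn_sents[0])   # e.g. ['BOS', '我', '爱', '自然', '语言', '处理', '。', 'EOS']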
Example #13
def getcityname(text):
    seg = pkuseg.pkuseg()   # load the model with its default configuration
    srcArray = seg.cut(text)    # tokenize the text
    print(srcArray)

    for val in cityname:
        if val in srcArray:
            print('found ' + val)
            return val
    print('not found')
    return None
Example #14
    def __init__(self, config):
        self.config = config
        self.seg = pkuseg.pkuseg() if self.config.use_word else None

        # load vocab if exist else create
        vocab_path = config.vocab_path
        print('Loading vocab from', vocab_path, ' ...')
        self.vocab = pkl.load(
            open(vocab_path,
                 'rb')) if os.path.exists(vocab_path) else self.build_vocab()
        print('Complete! Vocab size: {}'.format(len(self.vocab)))
Example #15
def calcTop50():
    npath = os.getcwd()
    filename = os.path.join(npath, 'ntlk1', 'area1.txt')
    with open(filename, encoding="utf-8") as f:
        txt = f.read()
    seg = pkuseg.pkuseg()

    #ctxt = jieba.lcut(txt)
    ctxt = seg.cut(txt)
    items = getTop(ctxt)
    #printTop(items, 50)
    ShowImgHotWord(ctxt, 50)
Example #16
 def __init__(self, config, norm=True, postag=True, **kwargs):
     """
     @param: user_dict
     """
     self.config = config
     self.norm   = norm
     self.postag = postag
     self.en_handler = spacy.load("en_core_web_sm")
     self.seghandler = pkuseg.pkuseg(user_dict=config['tokenizer']['user_dict'],
                                     postag=postag, **kwargs)
     self.init()
Example #17
def make_articles_from_contents(article_names: List[str],
                                article_contents: List[str]):

    seg = pkuseg.pkuseg()
    cutter = lambda sentence: seg.cut(sentence)

    articles = list()
    for i in tqdm(range(len(article_names))):
        articles.append(
            Article(name=article_names[i], terms=cutter(article_contents[i])))

    return articles
Example #18
def test_segment_sentence_list():
    # basic test
    seg_tool = pkuseg.pkuseg(postag=True)
    seg = WordSegmentation(is_lower=False,
                           is_use_stop_words=False,
                           is_use_word_tags_filter=False)

    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0] for item in sentence_cutted]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected

    # test stop-word filtering
    stop_words_vocab = ["根据", "了", ")", "("]
    seg = WordSegmentation(is_lower=False,
                           is_use_stop_words=True,
                           is_use_word_tags_filter=False,
                           stop_words_vocab=stop_words_vocab)

    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0] for item in sentence_cutted \
                        if item[0] not in stop_words_vocab]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected

    # test POS-tag-based filtering
    allow_word_tags = ["n", "v"]
    seg = WordSegmentation(is_lower=False,
                           is_use_stop_words=False,
                           is_use_word_tags_filter=True,
                           allow_word_tags=allow_word_tags)

    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0] for item in sentence_cutted \
                        if item[1] in allow_word_tags]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected

    # test that lower-casing works as expected
    seg = WordSegmentation(is_lower=True)

    expected = []
    for sentence in SENTENCE_LIST:
        sentence_cutted = seg_tool.cut(sentence)
        sentence_cutted = [item[0].lower() for item in sentence_cutted]
        expected.append(sentence_cutted)
    assert seg.segment_sentence_list(SENTENCE_LIST) == expected
Example #19
def pkuseg_pos(string):
    print('pkuseg tokenization and POS tagging:')
    num = len(string)
    print(num)
    seg = pkuseg.pkuseg(postag=True)  # load the model once instead of once per sentence
    start_time = datetime.now()
    for s in string:
        pos_list = seg.cut(s)
    all_time = (datetime.now() - start_time).total_seconds()
    avg = all_time / num
    print('pos_tag time used: {} sec'.format(avg))
    print('\n\n')
Example #20
    def from_bytes(self, data, **kwargs):
        pkuseg_features_b = b""
        pkuseg_weights_b = b""
        pkuseg_processors_data = None

        def deserialize_pkuseg_features(b):
            nonlocal pkuseg_features_b
            pkuseg_features_b = b

        def deserialize_pkuseg_weights(b):
            nonlocal pkuseg_weights_b
            pkuseg_weights_b = b

        def deserialize_pkuseg_processors(b):
            nonlocal pkuseg_processors_data
            pkuseg_processors_data = srsly.msgpack_loads(b)

        deserializers = OrderedDict((
            ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
            ("pkuseg_features", deserialize_pkuseg_features),
            ("pkuseg_weights", deserialize_pkuseg_weights),
            ("pkuseg_processors", deserialize_pkuseg_processors),
        ))
        util.from_bytes(data, deserializers, [])

        if pkuseg_features_b and pkuseg_weights_b:
            with tempfile.TemporaryDirectory() as tempdir:
                tempdir = Path(tempdir)
                with open(tempdir / "features.pkl", "wb") as fileh:
                    fileh.write(pkuseg_features_b)
                with open(tempdir / "weights.npz", "wb") as fileh:
                    fileh.write(pkuseg_weights_b)
                try:
                    import pkuseg
                except ImportError:
                    raise ImportError(
                        "pkuseg not installed. To use this model, " +
                        _PKUSEG_INSTALL_MSG)
                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
            if pkuseg_processors_data:
                (
                    user_dict,
                    do_process,
                    common_words,
                    other_words,
                ) = pkuseg_processors_data
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)

        return self
Example #21
 def __init__(self, ner_dict_path, stopword_file_path):
     self.ner_dict_path = ner_dict_path
     self.stopwords = set()
     self.pku = pkuseg.pkuseg()
     with open(stopword_file_path, 'r', encoding='utf-8') as file:
         for line in file:
             self.stopwords.add(line.strip())
     # self.ner = ner()
     jieba.load_userdict(ner_dict_path)
     jieba.analyse.set_stop_words(stopword_file_path)
     self.not_word = '[\n\t,,。`……·\u200b!!??“”""' '~::;;{}+-——=、/.()(|)%^&*@#$ <>《》【】[]\\]'
     self.key_word_pos = ('ns', 'n', 'vn', 'v', 'l', 'j', 'nr', 'nrt', 'nt',
                          'nz', 'nrfg', 'an', 's')
Example #22
def tokenize_words(doc, filter_exist=None):
    # input: doc is a long string
    #        filter_exist, jieba or pkuseg
    # output: dict_doc = {word: count, word: count, ...}
    #         cws_model, a words splitting model, jieba, pkuseg, or None
    if filter_exist == "pkuseg":
        print("Use pkuseg tokenizer ...")
        cws_model = pkuseg.pkuseg()
    else:
        print("Use jieba tokenizer ...")
        cws_model = jieba
    dict_doc = dict(Counter(list(cws_model.cut(doc))))
    return dict_doc, cws_model
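A hypothetical call exercising both backends; jieba, pkuseg and collections.Counter are assumed to be imported at module level, as in the snippet above:

# Hypothetical usage of tokenize_words()
doc = '北京大学的分词工具叫 pkuseg,结巴分词叫 jieba。'
counts, model = tokenize_words(doc, filter_exist='pkuseg')  # pkuseg backend
print(counts)
counts, model = tokenize_words(doc)                         # falls back to jieba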
Example #23
    def __init__(self, print_seg=False):
        self.seg = pkuseg.pkuseg(postag=True)
        self.print_seg = print_seg

        with open(os.path.join(os.path.dirname(__file__), "xdic.pkl"),
                  "rb") as f:
            self.dic = pickle.load(f)
        with open(os.path.join(os.path.dirname(__file__), "postag"), "r") as f:
            self.postag = dict((k.split(" ")[0], k.split(" ")[2])
                               for k in f.read().splitlines())
        with open(os.path.join(os.path.dirname(__file__), "t2s"), "r") as f:
            self.t2s = dict((k.split("\t")[0], k.split("\t")[1])
                            for k in f.read().splitlines())
Example #24
    def pkuseg_cut(self):
        model_path = r'D:\Anaconda\python\envs\spider\Lib\site-packages\pkuseg\models\default\tourism'
        seg = pkuseg.pkuseg(model_name=model_path)
        lines = []
        df = pd.read_excel(self.excel_path)
        sentences = df['sentences']
        for sentence in sentences:
            if sentence is not np.nan:
                # print(sentence)
                cut = seg.cut(sentence)
                lines.append(' '.join(cut))

        # print(lines)
        self.save_to_excel(lines)
Example #25
def get_pkuseg_result(sentences):
    """
    Ref to: https://github.com/lancopku/pkuseg-python
    Install by: `pip3 install pkuseg`
    Note that pkuseg-python only supports Python 3
    """
    import pkuseg
    seg = pkuseg.pkuseg()
    preds = []
    for sentence in sentences:
        sent_seg  = " ".join(seg.cut(sentence))
        sent_seg = to_unicode(sent_seg)
        preds.append(sent_seg)
    return preds
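A hypothetical call; to_unicode is assumed to be defined elsewhere in the same module:

# Hypothetical usage of get_pkuseg_result(): one space-joined segmentation per input sentence
for line in get_pkuseg_result(['我爱北京天安门', '自然语言处理很有趣']):
    print(line)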
Example #26
def encode_test_data1(data):
    '''
    Build the 100-sample test set for the bert/lstm/pooling/mlp models
    :param data:
    :return: test_data1.npy test_data2.npy test_label.npy
    '''
    test_label = []  # 1/0 labels: whether each sentence pair is similar
    sentences1_seg_list = [[] for index in range(len(data))]  # tokenized first sentences
    sentences2_seg_list = [[] for index in range(len(data))]  # tokenized second sentences
    word_index_dic = {}
    qinghuaSeg = pkuseg.pkuseg()
    for i in range(len(data)):
        sen1 = data[i][0].split(',')[0]
        sen2 = data[i][0].split(',')[1]
        test_label.append(int(data[i][0].split(',')[2]))
        seg_content_data1 = qinghuaSeg.cut(sen1)
        seg_content_data2 = qinghuaSeg.cut(sen2)
        # store the tokenization results
        sentences1_seg_list[i] = seg_content_data1
        sentences2_seg_list[i] = seg_content_data2
        # print('progress:', i / 100)

    # save label info
    if not os.path.exists("test_label.npy"):
        np.save("test_label.npy", np.array(test_label))
    with open('word_index_dict.json', 'r', encoding='utf-8') as f:
        word_index_dic = json.load(f)
    sentences1_data = []
    sentences2_data = []
    for ssl1 in range(len(sentences1_seg_list)):
        word_vec_list1 = []
        for word in sentences1_seg_list[ssl1]:
            if word in word_index_dic:
                word_vec_list1.append(word_index_dic[word])
            else:
                word_vec_list1.append(0)
        sentences1_data.append(word_vec_list1)
    for ssl2 in range(len(sentences2_seg_list)):
        word_vec_list2 = []
        for word in sentences2_seg_list[ssl2]:
            if word in word_index_dic:
                word_vec_list2.append(word_index_dic[word])
            else:
                word_vec_list2.append(0)
        sentences2_data.append(word_vec_list2)
    if not os.path.exists("test_data1.npy"):
        np.save("test_data1.npy", sentences1_data)
    if not os.path.exists("test_data2.npy"):
        np.save("test_data2.npy", sentences2_data)
Example #27
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
Example #28
def read_test_corpus(file_path):
    """读取语料
    :param file_path:
    :param type:
    :return:
    """
    src_data = []
    seg = pkuseg.pkuseg()
    with codecs.open(file_path, 'r', encoding='utf-8',
                     errors='ignore') as fout:
        for line in tqdm(fout.readlines(), desc='reading corpus'):
            if line is not None:
                src_data.append(seg.cut(line))
    return src_data
Example #29
def cut_news():
    mp_news_word_lst = {}
    seg = pkuseg.pkuseg(model_name='news')  # the domain-specific 'news' model is downloaded automatically
    for news_id, (news_title, news_content,
                  news_time) in tqdm(mp_news_txt.items()):
        text = news_title + " " + news_content  # use both the news title and the body
        # text = news_title  # use the title only
        word_list = list(
            filter(lambda x: len(x) > 0 and x not in stop_words,
                   map(my_utils.clean_word, seg.cut(text))))
        mp_news_word_lst[news_id] = word_list

    my_utils.write_pkl(mp_news_word_lst,
                       config['DEFAULT']['path_all_news_word_list'])
Example #30
def segment_file(src_file, tgt_file):
    seg = pkuseg.pkuseg()
    with open(src_file) as src:
        all_json_data = [json.loads(line) for line in src]

    with multiprocessing.Pool(processes=8) as pool:
        segmented = list(
            pool.map(partial(segment_single_item, seg),
                     all_json_data,
                     chunksize=1024))

    with open(tgt_file, 'w') as tgt:
        for s in segmented:
            tgt.write(json.dumps(s, ensure_ascii=False) + '\n')
Example #31
import jieba_fast as jieba
import pkuseg

seg = pkuseg.pkuseg()

class docA(object):
	def __init__(self, title: str, content: str, *args, author: str, year: int, category: str, tags: str, stem_mode: str = 'jieba', **kw):
		self.title = title
		self.content = content

		self.author = author
		self.year = year
		self.category = category
		self.tags = tags

		# self.stem =
		self.stem = self.stemInit(stem_mode)

		# self.misc = misc

	def stemInit(self,stem_mode):
		if stem_mode == 'jieba':
			return ' '.join(jieba.cut_for_search(self.content))
		elif stem_mode == 'pkuseg':
			return ' '.join(seg.cut(self.content))