Example 1
    def batch_segment_sentences(self, sentences):
        '''
            Segment multiple sentences in batch.

        :param sentences: array-like
        :return:
        '''
        self.jieba_util = Jieba_Util()
        segmented_sentences = map(self.segment_sentence, sentences)
        return segmented_sentences
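For readers without the repository's Jieba_Util wrapper, here is a minimal sketch of the same batch segmentation using the plain jieba library (an assumption: Jieba_Util wraps jieba). Note also that under Python 3 the map() call above returns a lazy iterator, so callers usually wrap it in list().

import jieba

def batch_segment_sentences(sentences):
    # jieba.lcut returns the tokens of one sentence as a list
    return [' '.join(jieba.lcut(sentence)) for sentence in sentences]

print(batch_segment_sentences([u'我要买手机', u'早上好']))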
Example 2
 def __init__(self):
     super(DataUtil, self).__init__()
     jutil = Jieba_Util(verbose=0)
     self.remove_sentence_punctuation = lambda x: jutil.seg(
         x, sep='', remove_url=False)
     self.get_sentence_length = lambda x: len(
         jutil.seg(
             x,
             sep=' ',
             full_mode=False,
             remove_stopword=False,
             replace_number=False,
             lowercase=True,
             zhs2zht=True,
             remove_url=True,
             HMM=False,
         ).split())
Example 3
    def remove_repet_data(self, data):
        '''
            Remove duplicate sentences (sentences that are identical after stripping punctuation count as duplicates).
                1. Initialize the jieba segmenter and use it to strip punctuation.
                2. Deduplicate.

        :param data:
        :return:
        '''

        jutil = Jieba_Util(verbose=0)
        # Strip punctuation
        remove_sentence_punctuation = lambda x: jutil.seg(
            x, sep='', remove_url=False)

        labels = []
        sentences = []
        for label, group in data.groupby(by=[u'LABEL']):
            # print(label,len(group),len(group[u'SENTENCE'].unique()))
            # Sentences and sentence count for this label after deduplication
            # print(group[u'SENTENCE'])
            # print(group[u'SENTENCE'].apply(remove_sentence_punctuation))
            norepet_sentcence_set = set()
            sentences_after_rm_rep = []
            for item in group[u'SENTENCE'].as_matrix():
                seged_sentence = remove_sentence_punctuation(item)
                if seged_sentence not in norepet_sentcence_set:
                    norepet_sentcence_set.add(seged_sentence)
                    sentences_after_rm_rep.append(item)
                    # print(seged_sentence)
                else:
                    pass
                    # print(item)
            num_after_rm_rep = len(sentences_after_rm_rep)
            sentences.extend(sentences_after_rm_rep)
            labels.extend([label] * num_after_rm_rep)

        # print(len(labels))
        # print(len(sentences))
        return pd.DataFrame(data={'LABEL': labels, 'SENTENCE': sentences})
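The same per-label deduplication can be expressed more compactly with pandas' duplicate-detection machinery. The sketch below is an assumption-laden stand-in: normalize() replaces Jieba_Util.seg(x, sep='', remove_url=False) with a simple regex, and newer pandas releases use .values instead of the deprecated .as_matrix() seen above.

import re
import pandas as pd

def normalize(sentence):
    # Hypothetical stand-in for Jieba_Util.seg(x, sep='', remove_url=False):
    # keep only word characters so punctuation differences are ignored.
    return re.sub(r'\W+', '', sentence, flags=re.UNICODE)

def remove_repeat_data(data):
    # Mark rows whose (LABEL, normalized SENTENCE) pair was already seen.
    duplicated = data.assign(KEY=data['SENTENCE'].apply(normalize)) \
                     .duplicated(subset=['LABEL', 'KEY'])
    return data.loc[~duplicated, ['LABEL', 'SENTENCE']].reset_index(drop=True)

df = pd.DataFrame({'LABEL': [u'问候', u'问候'], 'SENTENCE': [u'你好!', u'你好']})
print(remove_repeat_data(df))  # only one 你好 row survives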
Example 4
def test2():
    input_file1 = './sample_data/v2.3_train_Sa_891.csv'

    data = pd.read_csv(input_file1,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)

    data = data[data['LABEL'] != u'其它#其它']
    data = data[data['LABEL'] != u'其它#捣乱']
    print(data.head())
    # Word segmentation
    jieba_util = Jieba_Util()
    segment_sentence = lambda x: jieba_util.iter_each_word(
        sentence=x,
        sep=' ',
        need_segmented=True,
        full_mode=False,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
    data['WORDS'] = data['SENTENCE'].apply(segment_sentence).as_matrix()
    sentences = data['WORDS'].as_matrix()
    print('Number of sentences: %d' % sentences.shape)
    # print(sentences[-1])
    # quit()
    util = Word2vecUtil(size=50, train_method='cbow')
    util.train(sentences)
    util.print_model_descibe()

    most_similar_words = util.model.most_similar(u'机')
    most_similar_words = util.model.most_similar(u'喜')
    print(','.join([i for i, j in most_similar_words]))
    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
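Word2vecUtil is a project-specific wrapper; a hedged sketch of roughly the same training step with gensim directly (an assumption that the wrapper builds on gensim's Word2Vec; the keyword is vector_size in gensim >= 4.0 and size in older releases) could look like this, reusing the segmented sentences array from above.

from gensim.models import Word2Vec

tokenized = [s.split() for s in sentences]  # sentences: space-separated strings, as produced above
model = Word2Vec(sentences=tokenized, vector_size=50, sg=0, min_count=1)  # sg=0 selects CBOW, cf. train_method='cbow'
print(model.wv.most_similar(u'机')[:5])     # neighbours of a token that occurs in the training data
model.save('vector/v2.3_train_Sa_891_word_50dim.gem')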
Example 5
    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs):
        """
            One-hot feature encoder: converts sentences into one-hot encodings (represented as dictionary indices, padded to a fixed length).
            1. Initialize parameters.
            2. Build the feature encoder.

            :param need_segmented: data-processing option, whether the input still needs word segmentation. If False, the input is already segmented and every word in each sentence must be separated by spaces, e.g. ['我 要 买 手机','你好','早上 好']; if True, raw sentences can be provided, e.g. ['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param verbose: the larger the value, the more detailed the output
            :type verbose: int
            :param full_mode: jieba segmentation option, whether to use full mode, default True
            :type full_mode: bool
            :param feature_type: model option, granularity of the feature units; currently supports word, seg, word_seg and word_seg_concat.
                - word: character-level, e.g. 我要买手机 ---> 我 要 买 手 机
                - seg: segmented word units, e.g. 我要买手机 ---> 我 要 买 手机
                - word_seg: characters plus segmented words, deduplicated, e.g. 我要买手机 ---> 我 要 买 手机 手 机
                - word_seg_concat: characters plus segmented words, not deduplicated, e.g. 我要买手机 ---> 我 要 买 手 机 我 要 买 手机
            :type feature_type: str
            :param remove_stopword: jieba segmentation option, whether to remove stop words, default True
            :type remove_stopword: bool
            :param replace_number: jieba segmentation option, whether to replace numbers with the token NUM, default True
            :type replace_number: bool
            :param lowercase: jieba segmentation option, whether to lowercase the text, default True
            :type lowercase: bool
            :param zhs2zht: jieba segmentation option, whether to convert traditional Chinese characters to simplified when they appear, default True
            :type zhs2zht: bool
            :param remove_url: jieba segmentation option, whether to remove Weibo URLs (addresses starting with http://t.cn/), default True
            :type remove_url: bool
            :param add_unkown_word: training-dictionary option, whether to add an unknown-word token (UNKOWN) to the dictionary
            :type add_unkown_word: bool
            :param sentence_padding_length: padding (truncation) length for sentences, default 7
            :type sentence_padding_length: int
            :param padding_mode: padding (truncation) mode for sentences, one of four modes:
                1. center: if shorter than sentence_padding_length, pad with 0 on both sides; if longer, truncate at the end.
                2. left: if shorter than sentence_padding_length, pad with 0 on the left; if longer, truncate at the end.
                3. right: if shorter than sentence_padding_length, pad with 0 on the right; if longer, truncate at the end.
                4. none: no padding.
            :type padding_mode: str
            :param to_onehot_array: whether to output a one-hot array instead of a dictionary-index array; default False, i.e. output dictionary indices
            :type to_onehot_array: bool
            :param word2vec_to_solve_oov: use word2vec to resolve OOV words
            :type word2vec_to_solve_oov: bool
            :param kwargs:
                - word2vec_model_file_path:
                - vocabulary_including_test_set: (default: True)
                - update_dictionary: (default: True)
                - etc.

        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # Validate arguments
        assert self.padding_mode in [
            'center', 'left', 'right', 'none'
        ], 'padding_mode must be one of: center, left, right, none'
        assert self.feature_type in [
            'word', 'seg', 'word_seg', 'word_seg_concat'
        ], 'feature_type must be one of: word, seg, word_seg, word_seg_concat'

        # Initialize the jieba segmenter
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)
        # Dictionary object extracted from the training data
        self.train_data_dict = None
        # Vocabulary list extracted from the training data
        self.vocabulary = None
        # Number of vocabulary entries extracted from the training data
        self.vocabulary_size = None
        # Index of the UNKOWN token
        self.unknow_token_index = None
        # Index of the PADDING token
        self.padding_token_index = None

        # region NOTE: these variables are no longer maintained because they consume memory
        # Raw training data
        # self.train_data = None
        # Segmented sentences
        # self.segmented_sentences = None
        # Training sentences as dictionary indices
        # self.train_index = None
        # Training sentences as padded dictionary indices
        # self.train_padding_index = None
        # Training sentences packed into a one-hot array
        # endregion
        self.train_onehot_array = None
        # word2vec model
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert 'word2vec_model_file_path' in kwargs, \
                'the keyword argument word2vec_model_file_path must be provided'
            # Load the word2vec model
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        if verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')
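The padding_mode parameter above only describes the behaviour in prose. The following hypothetical helper (not the encoder's actual implementation, and assuming 0 is the PADDING index) illustrates the four modes for sentence_padding_length=7.

def pad_indices(indices, length=7, mode='center'):
    if mode == 'none':
        return indices
    if len(indices) >= length:
        return indices[:length]                      # too long: truncate at the end
    missing = length - len(indices)
    if mode == 'center':
        left = missing // 2
        return [0] * left + indices + [0] * (missing - left)
    if mode == 'left':
        return [0] * missing + indices
    if mode == 'right':
        return indices + [0] * missing

print(pad_indices([3, 8, 5], mode='center'))  # [0, 0, 3, 8, 5, 0, 0]
print(pad_indices([3, 8, 5], mode='left'))    # [0, 0, 0, 0, 3, 8, 5]
print(pad_indices([3, 8, 5], mode='right'))   # [3, 8, 5, 0, 0, 0, 0]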
Example 6
 def __init__(self):
     jutil = Jieba_Util(verbose=0)
     self.remove_sentence_punctuation = lambda x: jutil.seg(
         x, sep='', remove_url=False)
Example 7
 def __init__(self):
     # Initialize the jieba utility
     self.jieba_util = Jieba_Util()
Example 8
"""
    Author:  'jdwang'
    Date:    'create date: 2016-07-16'
    Email:   '*****@*****.**'
    Describe: 
"""
from __future__ import print_function

import numpy as np
import pandas as pd
import logging
import timeit

from data_processing_util.jiebanlp.jieba_util import Jieba_Util

jutil = Jieba_Util(verbose=0)
remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)

# Count the number of dialogue segments that enter the coprocessor

ch2r_dialogue_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/ch2r_test_dataset/start-20150613测试集/data/dialogue_usersentence_ge_1.csv'

ch2r_dialogue = pd.read_csv(
    ch2r_dialogue_file_path,
    sep='\t',
    encoding='utf8',
    header=0,
)

user_sentence = ch2r_dialogue[ch2r_dialogue['Name'] != 'Ch2R']
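The snippet stops after filtering out Ch2R's own turns. A hedged continuation toward the stated goal (counting the dialogue segments that reach the coprocessor) might look as follows; the text column name 'SENTENCE' is an assumption and should be adjusted to the actual CSV header.

print('Number of user utterances: %d' % len(user_sentence))
# Count distinct user utterances once punctuation is stripped ('SENTENCE' is an assumed column name).
normalized = user_sentence['SENTENCE'].apply(remove_sentence_punctuation)
print('Distinct user utterances (ignoring punctuation): %d' % normalized.nunique())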
Example 9
    def __init__(
            self,
            # rand_seed=1337,
            verbose=0,
            need_segmented=True,
            full_mode=True,
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            feature_method='bow',
            feature_type='seg',
            max_features=None,
            word2vec_to_solve_oov=False,
            save_middle_result=False,
            **kwargs):
        '''
            1. Initialize parameters and validate them.
            2. Build the feature encoder.

            :param verbose: the larger the value, the more detailed the output
            :type verbose: int
            :param need_segmented: data-processing option, whether the input still needs word segmentation. If False, the input is already segmented and every word in each sentence must be separated by spaces, e.g. ['我 要 买 手机','你好','早上 好']; if True, raw sentences can be provided, e.g. ['我要买手机','你好','早上好'].
            :type need_segmented: bool
            :param full_mode: jieba segmentation option, whether to use full mode, default True
            :type full_mode: bool
            :param remove_stopword: jieba segmentation option, whether to remove stop words, default True
            :type remove_stopword: bool
            :param replace_number: jieba segmentation option, whether to replace numbers with the token NUM, default True
            :type replace_number: bool
            :param lowercase: jieba segmentation option, whether to lowercase the text, default True
            :type lowercase: bool
            :param zhs2zht: jieba segmentation option, whether to convert traditional Chinese characters to simplified when they appear, default True
            :type zhs2zht: bool
            :param remove_url: jieba segmentation option, whether to remove Weibo URLs (addresses starting with http://t.cn/), default True
            :type remove_url: bool
            :param feature_method: model option, feature weighting method, either bow or tfidf
            :type feature_method: str
            :param feature_type: model option, granularity of the feature units; currently supports word, seg and word_seg.
                - word: character-level, e.g. 我要买手机 ---> 我 要 买 手 机
                - seg: segmented word units, e.g. 我要买手机 ---> 我 要 买 手机
                - word_seg: characters plus segmented words, e.g. 我要买手机 ---> 我 要 买 手机 手 机
            :type feature_type: str
            :param max_features: model option, maximum number of feature words to select
            :type max_features: int
            :param word2vec_to_solve_oov: use word2vec to resolve OOV words
            :type word2vec_to_solve_oov: bool
            :param save_middle_result: whether to keep intermediate results; disabled by default to save memory!
            :type save_middle_result: bool
            :param kwargs: supports word2vec_model_file_path, etc.
            :type kwargs: dict


        '''
        # self.rand_seed = rand_seed
        self.save_middle_result = save_middle_result
        self.verbose = verbose
        self.full_mode = full_mode
        self.remove_stopword = remove_stopword
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.feature_method = feature_method
        self.feature_type = feature_type
        self.max_features = max_features
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # Validate arguments
        assert self.feature_method in [
            'bow', 'tfidf'
        ], 'feature_method must be one of: bow, tfidf'
        assert self.feature_type in [
            'word', 'seg', 'word_seg'
        ], 'feature_type must be one of: word, seg, word_seg'

        if word2vec_to_solve_oov:
            assert 'word2vec_model_file_path' in kwargs, \
                'the keyword argument word2vec_model_file_path must be provided'
            # Load the word2vec model
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        # Initialize the jieba segmenter
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)

        # Feature encoder: bow or tf-idf transformer
        self.feature_encoder = None
        # Dictionary object extracted from the training data
        self.train_data_dict = None
        # Vocabulary list extracted from the training data
        self.vocabulary = None
        # Number of vocabulary entries extracted from the training data
        self.vocabulary_size = None
        # Number of training examples
        self.train_data_count = 0

        # region To save memory, it is recommended to set save_middle_result = False in real runs (disables saving of intermediate results)
        if self.save_middle_result:
            # Raw training data
            self.train_data = None
            # Segmented sentences
            self.segmented_sentences = None
            # Training sentence features
            self.train_features = None
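feature_method='bow' versus 'tfidf' corresponds to plain term counts versus tf-idf weights. The sketch below shows the distinction with scikit-learn on already-segmented sentences (an assumption about what the encoder computes internally; the token_pattern override keeps single-character tokens, which sklearn's default pattern drops, and get_feature_names_out requires scikit-learn >= 1.0, older versions use get_feature_names).

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

segmented = [u'我 要 买 手机', u'你好', u'早上 好']   # space-separated, i.e. need_segmented=False style input

bow = CountVectorizer(token_pattern=r'(?u)\b\w+\b', max_features=None)
bow_features = bow.fit_transform(segmented)          # sparse matrix of raw counts

tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
tfidf_features = tfidf.fit_transform(segmented)      # sparse matrix of tf-idf weights

print(bow.get_feature_names_out())                   # learned vocabulary, cf. self.vocabulary
print(bow_features.toarray())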