def batch_segment_sentences(self, sentences):
    """
    Segment multiple sentences in batch.

    :param sentences: array-like
    :return: the segmented sentences
    """
    self.jieba_util = Jieba_Util()
    segmented_sentences = map(self.segment_sentence, sentences)
    return segmented_sentences
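# Usage sketch for batch_segment_sentences (illustrative, not part of the
# original source). Assumes DataUtil is the class defining the method above
# (the import path depends on the project layout) and that segment_sentence
# returns a space-separated string of tokens; actual tokens depend on jieba's
# dictionary and settings.
util = DataUtil()
segmented = util.batch_segment_sentences([u'我要买手机', u'早上好'])
for s in segmented:
    print(s)  # e.g. u'我 要 买 手机'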
def __init__(self):
    super(DataUtil, self).__init__()
    jutil = Jieba_Util(verbose=0)
    # Strip punctuation by joining segments with an empty separator
    self.remove_sentence_punctuation = lambda x: jutil.seg(
        x, sep='', remove_url=False)
    # Sentence length = number of space-separated tokens after segmentation
    self.get_sentence_length = lambda x: len(jutil.seg(
        x,
        sep=' ',
        full_mode=False,
        remove_stopword=False,
        replace_number=False,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        HMM=False,
    ).split())
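# Illustrative sketch (not part of the original source) of the two helpers
# built above. Exact outputs depend on Jieba_Util's segmentation; the values
# shown follow the segmentation conventions described in this codebase.
du = DataUtil()
# Joining segments with sep='' drops the punctuation between them:
print(du.remove_sentence_punctuation(u'你好,世界!'))  # expected: u'你好世界'
# Length is the number of space-separated tokens after segmentation:
print(du.get_sentence_length(u'我要买手机'))  # e.g. 4, for u'我 要 买 手机'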
def remove_repet_data(self, data):
    """
    Remove duplicate sentences (sentences that are identical after removing
    punctuation count as duplicates).
    1. Initialize the jieba segmenter and use it to strip punctuation.
    2. Deduplicate within each label group.

    :param data: pd.DataFrame with LABEL and SENTENCE columns
    :return: deduplicated pd.DataFrame
    """
    jutil = Jieba_Util(verbose=0)
    # Strip punctuation by joining segments with an empty separator
    remove_sentence_punctuation = lambda x: jutil.seg(
        x, sep='', remove_url=False)

    labels = []
    sentences = []
    for label, group in data.groupby(by=[u'LABEL']):
        # Keep only the first occurrence of each punctuation-stripped sentence
        no_repeat_sentence_set = set()
        sentences_after_rm_rep = []
        for item in group[u'SENTENCE'].as_matrix():
            seged_sentence = remove_sentence_punctuation(item)
            if seged_sentence not in no_repeat_sentence_set:
                no_repeat_sentence_set.add(seged_sentence)
                sentences_after_rm_rep.append(item)
        num_after_rm_rep = len(sentences_after_rm_rep)
        sentences.extend(sentences_after_rm_rep)
        labels.extend([label] * num_after_rm_rep)
    return pd.DataFrame(data={'LABEL': labels, 'SENTENCE': sentences})
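# Illustrative example of remove_repet_data (not part of the original source):
# two sentences that differ only in punctuation collapse into one row.
data = pd.DataFrame({
    u'LABEL': [u'A', u'A', u'B'],
    u'SENTENCE': [u'你好!', u'你好', u'早上好'],
})
deduped = DataUtil().remove_repet_data(data)
print(len(deduped))  # expected: 2 -- u'你好!' and u'你好' are duplicates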
def test2():
    input_file1 = './sample_data/v2.3_train_Sa_891.csv'
    data = pd.read_csv(input_file1,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)
    # Drop the "other" categories
    data = data[data['LABEL'] != u'其它#其它']
    data = data[data['LABEL'] != u'其它#捣乱']
    print(data.head())

    # Word segmentation
    jieba_util = Jieba_Util()
    segment_sentence = lambda x: jieba_util.iter_each_word(
        sentence=x,
        sep=' ',
        need_segmented=True,
        full_mode=False,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
    data['WORDS'] = data['SENTENCE'].apply(segment_sentence).as_matrix()
    sentences = data['WORDS'].as_matrix()
    print('Number of sentences: %d' % sentences.shape[0])

    # Train a 50-dimensional CBOW word2vec model on the segmented sentences
    util = Word2vecUtil(size=50, train_method='cbow')
    util.train(sentences)
    util.print_model_descibe()

    # Query the most similar words for each probe character
    # (the original queried u'机' and then overwrote the result with u'喜';
    # printing after each query keeps both results)
    for word in [u'机', u'喜']:
        most_similar_words = util.model.most_similar(word)
        print(','.join([i for i, j in most_similar_words]))

    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
def __init__(self,
             need_segmented=True,
             verbose=0,
             full_mode=True,
             feature_type='seg',
             remove_stopword=True,
             replace_number=True,
             lowercase=True,
             zhs2zht=True,
             remove_url=True,
             sentence_padding_length=7,
             padding_mode='center',
             add_unkown_word=True,
             to_onehot_array=False,
             word2vec_to_solve_oov=False,
             **kwargs):
    """
    Onehot feature encoder: converts sentences into onehot encoding
    (represented as dictionary indices, padded to a fixed length).
    1. Initialize parameters.
    2. Build the feature encoder.

    :param need_segmented: data option: whether the input still needs word
        segmentation. If False, each sentence must already be segmented, with
        words separated by spaces, e.g. ['我 要 买 手机','你好','早上 好'];
        if True, raw sentences are fine, e.g. ['我要买手机','你好','早上好'].
    :type need_segmented: bool
    :param verbose: the larger the value, the more detailed the output
    :type verbose: int
    :param full_mode: jieba option: whether to use full mode, default True
    :type full_mode: bool
    :param feature_type: model option: the feature granularity. Supports
        word, seg, word_seg and word_seg_concat.
        - word: character units, e.g. 我要买手机 ---> 我 要 买 手 机
        - seg: segmented word units, e.g. 我要买手机 ---> 我 要 买 手机
        - word_seg: characters and segmented words, deduplicated,
          e.g. 我要买手机 ---> 我 要 买 手机 手 机
        - word_seg_concat: characters and segmented words, not deduplicated,
          e.g. 我要买手机 ---> 我 要 买 手 机 我 要 买 手机
    :type feature_type: str
    :param remove_stopword: jieba option: whether to remove stop words, default True
    :type remove_stopword: bool
    :param replace_number: jieba option: whether to replace numbers with NUM, default True
    :type replace_number: bool
    :param lowercase: jieba option: whether to lowercase all letters, default True
    :type lowercase: bool
    :param zhs2zht: jieba option: whether to convert traditional Chinese
        characters to simplified when they appear, default True
    :type zhs2zht: bool
    :param remove_url: jieba option: whether to remove Weibo URLs (addresses
        starting with http://t.cn/), default True
    :type remove_url: bool
    :param add_unkown_word: dictionary option: whether to add an unknown-word
        token (UNKOWN) to the training dictionary
    :type add_unkown_word: bool
    :param sentence_padding_length: padding (truncation) length for sentences, default 7
    :type sentence_padding_length: int
    :param padding_mode: padding (truncation) mode; four modes are available:
        1. center: if shorter than sentence_padding_length, pad 0 on both
           sides; if longer, truncate at the end.
        2. left: if shorter, pad 0 on the left; if longer, truncate at the end.
        3. right: if shorter, pad 0 on the right; if longer, truncate at the end.
        4. none: no padding.
    :type padding_mode: str
    :param to_onehot_array: whether to output a onehot array or a
        dictionary-index array; default False, i.e. dictionary indices
    :type to_onehot_array: bool
    :param word2vec_to_solve_oov: use word2vec to expand OOV words
    :type word2vec_to_solve_oov: bool
    :param kwargs:
        - word2vec_model_file_path:
        - vocabulary_including_test_set: (default True)
        - update_dictionary: (default True)
        - etc.
    """
    self.full_mode = full_mode
    self.feature_type = feature_type
    self.remove_stopword = remove_stopword
    self.verbose = verbose
    self.need_segmented = need_segmented
    self.replace_number = replace_number
    self.lowercase = lowercase
    self.zhs2zht = zhs2zht
    self.remove_url = remove_url
    self.add_unkown_word = add_unkown_word
    self.sentence_padding_length = sentence_padding_length
    self.padding_mode = padding_mode
    self.to_onehot_array = to_onehot_array
    self.word2vec_to_solve_oov = word2vec_to_solve_oov
    self.kwargs = kwargs

    # Validate arguments
    assert self.padding_mode in ['center', 'left', 'right', 'none'], \
        'padding_mode must be one of: center, left, right, none'
    assert self.feature_type in ['word', 'seg', 'word_seg', 'word_seg_concat'], \
        'feature_type must be one of: word, seg, word_seg, word_seg_concat'

    # Initialize the jieba segmenter
    if need_segmented:
        self.jieba_seg = Jieba_Util(verbose=self.verbose)

    # Dictionary object extracted from the training corpus
    self.train_data_dict = None
    # Vocabulary list extracted from the training corpus
    self.vocabulary = None
    # Vocabulary size of the training corpus
    self.vocabulary_size = None
    # Index of the UNKOWN token
    self.unknow_token_index = None
    # Index of the PADDING token
    self.padding_token_index = None

    # region NOTE: these variables are no longer maintained, to save memory
    # Raw training data
    # self.train_data = None
    # Segmented sentences
    # self.segmented_sentences = None
    # Training sentences as dictionary indices
    # self.train_index = None
    # Training sentences as padded dictionary indices
    # self.train_padding_index = None
    # Training sentences as onehot arrays
    # endregion
    self.train_onehot_array = None
    # word2vec model
    self.word2vec_model = None
    if word2vec_to_solve_oov:
        assert 'word2vec_model_file_path' in kwargs, \
            'please provide the word2vec_model_file_path attribute'
        # Load the word2vec model
        w2v_util = Word2vecUtil()
        self.word2vec_model = w2v_util.load(
            kwargs.get('word2vec_model_file_path'))

    if verbose > 1:
        logging.debug('build feature encoder...')
        print('build feature encoder...')
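# Usage sketch (illustrative, not part of the original source). The class name
# FeatureEncoder is an assumption -- only the __init__ above is shown here --
# and so is any fit/transform-style API it may expose.
encoder = FeatureEncoder(
    need_segmented=True,          # raw sentences, segment internally
    feature_type='word_seg',      # characters + segmented words, deduplicated
    sentence_padding_length=7,    # pad/truncate every sentence to 7 units
    padding_mode='center',        # pad 0 on both sides, truncate at the end
    to_onehot_array=False,        # return dictionary-index arrays, not onehot
)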
def __init__(self):
    jutil = Jieba_Util(verbose=0)
    self.remove_sentence_punctuation = lambda x: jutil.seg(
        x, sep='', remove_url=False)
def __init__(self):
    # Initialize the jieba utility
    self.jieba_util = Jieba_Util()
""" Author: 'jdwang' Date: 'create date: 2016-07-16' Email: '*****@*****.**' Describe: """ from __future__ import print_function import numpy as np import pandas as pd import logging import timeit from data_processing_util.jiebanlp.jieba_util import Jieba_Util jutil = Jieba_Util(verbose=0) remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False) # 统计 进入协处理的对话段数 ch2r_dialogue_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/ch2r_test_dataset/start-20150613测试集/data/dialogue_usersentence_ge_1.csv' ch2r_dialogue = pd.read_csv( ch2r_dialogue_file_path, sep='\t', encoding='utf8', header=0, ) user_sentence = ch2r_dialogue[ch2r_dialogue['Name'] != 'Ch2R']
def __init__(self,
             # rand_seed=1337,
             verbose=0,
             need_segmented=True,
             full_mode=True,
             remove_stopword=True,
             replace_number=True,
             lowercase=True,
             zhs2zht=True,
             remove_url=True,
             feature_method='bow',
             feature_type='seg',
             max_features=None,
             word2vec_to_solve_oov=False,
             save_middle_result=False,
             **kwargs):
    """
    1. Initialize parameters and validate them.
    2. Build the feature encoder.

    :param verbose: the larger the value, the more detailed the output
    :type verbose: int
    :param need_segmented: data option: whether the input still needs word
        segmentation. If False, each sentence must already be segmented, with
        words separated by spaces, e.g. ['我 要 买 手机','你好','早上 好'];
        if True, raw sentences are fine, e.g. ['我要买手机','你好','早上好'].
    :type need_segmented: bool
    :param full_mode: jieba option: whether to use full mode, default True
    :type full_mode: bool
    :param remove_stopword: jieba option: whether to remove stop words, default True
    :type remove_stopword: bool
    :param replace_number: jieba option: whether to replace numbers with NUM, default True
    :type replace_number: bool
    :param lowercase: jieba option: whether to lowercase all letters, default True
    :type lowercase: bool
    :param zhs2zht: jieba option: whether to convert traditional Chinese
        characters to simplified when they appear, default True
    :type zhs2zht: bool
    :param remove_url: jieba option: whether to remove Weibo URLs (addresses
        starting with http://t.cn/), default True
    :type remove_url: bool
    :param feature_method: model option: the feature weighting method, bow or tfidf
    :type feature_method: str
    :param feature_type: model option: the feature granularity. Supports
        word, seg and word_seg.
        - word: character units, e.g. 我要买手机 ---> 我 要 买 手 机
        - seg: segmented word units, e.g. 我要买手机 ---> 我 要 买 手机
        - word_seg: characters and segmented words,
          e.g. 我要买手机 ---> 我 要 买 手机 手 机
    :type feature_type: str
    :param max_features: model option: maximum number of features to select
    :type max_features: int
    :param word2vec_to_solve_oov: use word2vec to expand OOV words
    :type word2vec_to_solve_oov: bool
    :param save_middle_result: whether to save intermediate results; off by
        default to save space!
    :type save_middle_result: bool
    :param kwargs: supports word2vec_model_file_path, etc.
    :type kwargs: dict
    """
    # self.rand_seed = rand_seed
    self.save_middle_result = save_middle_result
    self.verbose = verbose
    self.full_mode = full_mode
    self.remove_stopword = remove_stopword
    self.need_segmented = need_segmented
    self.replace_number = replace_number
    self.lowercase = lowercase
    self.zhs2zht = zhs2zht
    self.remove_url = remove_url
    self.feature_method = feature_method
    self.feature_type = feature_type
    self.max_features = max_features
    self.word2vec_to_solve_oov = word2vec_to_solve_oov
    self.kwargs = kwargs

    # Validate arguments
    assert self.feature_method in ['bow', 'tfidf'], \
        'feature_method must be one of: bow, tfidf'
    assert self.feature_type in ['word', 'seg', 'word_seg'], \
        'feature_type must be one of: word, seg, word_seg'

    if word2vec_to_solve_oov:
        # Load the word2vec model
        assert 'word2vec_model_file_path' in kwargs, \
            'please provide the word2vec_model_file_path attribute'
        w2v_util = Word2vecUtil()
        self.word2vec_model = w2v_util.load(
            kwargs.get('word2vec_model_file_path'))

    # Initialize the jieba segmenter
    if need_segmented:
        self.jieba_seg = Jieba_Util(verbose=self.verbose)

    # Feature encoder: bow or tf-idf transformer
    self.feature_encoder = None
    # Dictionary object extracted from the training corpus
    self.train_data_dict = None
    # Vocabulary list extracted from the training corpus
    self.vocabulary = None
    # Vocabulary size of the training corpus
    self.vocabulary_size = None
    # Number of training samples
    self.train_data_count = 0

    # region To save memory, set save_middle_result=False in practice
    if self.save_middle_result:
        # Raw training data
        self.train_data = None
        # Segmented sentences
        self.segmented_sentences = None
        # Training sentence features
        self.train_features = None
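# Usage sketch (illustrative, not part of the original source). The class name
# BowFeatureEncoder is an assumption -- only the __init__ above is shown here.
encoder = BowFeatureEncoder(
    feature_method='tfidf',       # or 'bow'
    feature_type='word_seg',      # characters + segmented words
    max_features=2000,            # cap the vocabulary at 2000 features
    save_middle_result=False,     # keep off to save memory, per the docstring
)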