def __init__(self):
    super(DataUtil, self).__init__()
    jutil = Jieba_Util(verbose=0)
    # Strip punctuation: join the segmented tokens with no separator
    self.remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)
    # Sentence length, counted in tokens after segmentation
    self.get_sentence_length = lambda x: len(jutil.seg(
        x,
        sep=' ',
        full_mode=False,
        remove_stopword=False,
        replace_number=False,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        HMM=False,
    ).split())
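# A minimal usage sketch of the two lambdas above (assumptions: this __init__
# belongs to the DataUtil class defined later in this section, and Jieba_Util.seg
# joins the segmented tokens with `sep`, as the calls above imply). The expected
# outputs in the comments are illustrative, not authoritative.
if __name__ == '__main__':
    du = DataUtil()
    print(du.remove_sentence_punctuation(u'你好,在吗?'))  # punctuation stripped, e.g. u'你好在吗'
    print(du.get_sentence_length(u'我要买手机'))  # length in segmented tokens, e.g. 3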
def remove_repet_data(self, data):
    '''
    Remove duplicate sentences (two sentences count as duplicates if they are
    identical after punctuation is stripped).

    1. Initialise the jieba segmenter and use it to strip punctuation.
    2. Deduplicate within each label group.

    :param data: pd.DataFrame with LABEL and SENTENCE columns
    :return: deduplicated pd.DataFrame
    '''
    jutil = Jieba_Util(verbose=0)
    # Strip punctuation
    remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False)
    labels = []
    sentences = []
    for label, group in data.groupby(by=[u'LABEL']):
        norepet_sentence_set = set()
        sentences_after_rm_rep = []
        for item in group[u'SENTENCE'].as_matrix():
            seged_sentence = remove_sentence_punctuation(item)
            if seged_sentence not in norepet_sentence_set:
                norepet_sentence_set.add(seged_sentence)
                sentences_after_rm_rep.append(item)
        num_after_rm_rep = len(sentences_after_rm_rep)
        sentences.extend(sentences_after_rm_rep)
        labels.extend([label] * num_after_rm_rep)
    return pd.DataFrame(data={'LABEL': labels, 'SENTENCE': sentences})
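# A small sketch of remove_repet_data on toy data (the toy rows are assumptions,
# and the method is assumed to live on the DataUtil class defined later): within
# a label, sentences that differ only in punctuation collapse into one row.
if __name__ == '__main__':
    toy = pd.DataFrame({
        u'LABEL': [u'表态#肯定', u'表态#肯定', u'表态#否定'],
        u'SENTENCE': [u'好的', u'好的!', u'不要'],
    })
    deduped = DataUtil().remove_repet_data(toy)
    print(deduped)  # expect one 表态#肯定 row and one 表态#否定 row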
def test2():
    input_file1 = './sample_data/v2.3_train_Sa_891.csv'
    data = pd.read_csv(input_file1,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)
    data = data[data['LABEL'] != u'其它#其它']
    data = data[data['LABEL'] != u'其它#捣乱']
    print(data.head())
    # Segment the sentences
    jieba_util = Jieba_Util()
    segment_sentence = lambda x: jieba_util.iter_each_word(
        sentence=x,
        sep=' ',
        need_segmented=True,
        full_mode=False,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
    data['WORDS'] = data['SENTENCE'].apply(segment_sentence).as_matrix()
    sentences = data['WORDS'].as_matrix()
    print('number of sentences: %d' % sentences.shape[0])
    util = Word2vecUtil(size=50, train_method='cbow')
    util.train(sentences)
    util.print_model_descibe()
    # Print the neighbours of each query word (the original overwrote the
    # first result before printing)
    for query in [u'机', u'喜']:
        most_similar_words = util.model.most_similar(query)
        print(','.join([i for i, j in most_similar_words]))
    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
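# Sketch: reload the vectors saved by test2() and query them. This assumes
# Word2vecUtil.load returns a gensim-style model exposing most_similar, as
# test2() itself relies on; the query word is an arbitrary example.
def query_saved_vectors(path='vector/v2.3_train_Sa_891_word_50dim.gem', word=u'手机'):
    model = Word2vecUtil().load(path)
    for similar_word, score in model.most_similar(word):
        print(u'%s\t%f' % (similar_word, score))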
class FeatureEncoder(object):
    """
    Onehot feature encoder: turns sentences into onehot encodings.

    Main methods:
        1. segment_sentence: segment a sentence
        2. build_dictionary: build the dictionary
        3. sentence_to_index: turn a raw sentence into a list of dictionary indices
        4. sentence_padding: pad a sentence to a fixed length
        5. fit_transform: build the encoder and transform the data
        6. transform_sentence: encode one sentence
        7. get_sentence_length: compute the length of a sentence
        8. print_sentence_length_detail: print sentence-length statistics of the training set
        9. print_model_descibe: print the model settings
        10. sentence_index_to_bow: turn indices into onehot data
        11. to_onehot_array: generate the onehot encoding of the training sentences
        12. reset: clear the data

    Notes:
        1. The onehot encoding comes in two forms, switched with `to_onehot_array`:
            - padded dictionary-index form (the default)
            - onehot vectors
        2. The dictionary indices of all training-set words, including the
           unknown token (UNKOWN), are assigned from 1 upwards; index 0 is
           reserved for the padding token.
        3. The dictionary size (vocabulary_size) includes index 0, i.e. it
           counts every training-set word plus the padding token (PADDING) and
           the unknown token (UNKOWN); the unknown token can be disabled.
    """

    __version__ = '1.4'

    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs
                 ):
        """
        Onehot feature encoder: turns sentences into onehot encodings
        (padded dictionary-index form).

        1. Initialise the parameters.
        2. Build the feature encoder.

        :param need_segmented: whether the input still needs word segmentation.
            If False, every sentence must already be space-separated, e.g.
            ['我 要 买 手机','你好','早上 好']; if True, raw sentences are fine,
            e.g. ['我要买手机','你好','早上好'].
        :type need_segmented: bool
        :param verbose: the larger the value, the more detailed the output
        :type verbose: int
        :param full_mode: jieba option, whether to use full mode, default True
        :type full_mode: bool
        :param feature_type: granularity of the feature unit; one of word, seg,
            word_seg and word_seg_concat.
            - word: character units, e.g. 我要买手机 ---> 我 要 买 手 机
            - seg: segmented word units, e.g. 我要买手机 ---> 我 要 买 手机
            - word_seg: characters plus segmented words, deduplicated,
              e.g. 我要买手机 ---> 我 要 买 手机 手 机
            - word_seg_concat: characters plus segmented words, not deduplicated,
              e.g. 我要买手机 ---> 我 要 买 手 机 我 要 买 手机
        :type feature_type: str
        :param remove_stopword: jieba option, whether to remove stop words, default True
        :type remove_stopword: bool
        :param replace_number: jieba option, whether to replace numbers with NUM, default True
        :type replace_number: bool
        :param lowercase: jieba option, whether to lowercase the text, default True
        :type lowercase: bool
        :param zhs2zht: jieba option, whether to convert traditional characters to simplified, default True
        :type zhs2zht: bool
        :param remove_url: jieba option, whether to remove Weibo URLs (addresses starting with http://t.cn/), default True
        :type remove_url: bool
        :param add_unkown_word: dictionary option, whether to add an unknown token (UNKOWN) to the dictionary
        :type add_unkown_word: bool
        :param sentence_padding_length: padding (truncation) length, default 7
        :type sentence_padding_length: int
        :param padding_mode: padding (truncation) mode, one of four:
            1. center: sentences shorter than sentence_padding_length are padded
               with 0 on both sides; longer ones are truncated at the end.
            2. left: shorter sentences are padded with 0 on the left; longer
               ones are truncated at the end.
            3. right: shorter sentences are padded with 0 on the right; longer
               ones are truncated at the end.
            4. none: no padding.
        :type padding_mode: str
        :param to_onehot_array: output onehot arrays rather than
            dictionary-index arrays; default False (dictionary indices)
        :type to_onehot_array: bool
        :param word2vec_to_solve_oov: use word2vec to expand OOV words
        :type word2vec_to_solve_oov: bool
        :param kwargs:
            - word2vec_model_file_path:
            - vocabulary_including_test_set: (default True)
            - update_dictionary: (default True)
            - etc.
        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # Validate parameters
        assert self.padding_mode in ['center', 'left', 'right', 'none'], \
            'padding_mode must be one of: center, left, right, none'
        assert self.feature_type in ['word', 'seg', 'word_seg', 'word_seg_concat'], \
            'feature_type must be one of: word, seg, word_seg, word_seg_concat'

        # Initialise the jieba segmenter
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)
        # Dictionary object extracted from the training set
        self.train_data_dict = None
        # Vocabulary list extracted from the training set
        self.vocabulary = None
        # Vocabulary size of the training set
        self.vocabulary_size = None
        # Index of the UNKOWN token
        self.unknow_token_index = None
        # Index of the PADDING token
        self.padding_token_index = None

        # region NOTE: the following fields are no longer maintained because they consume memory
        # self.train_data = None            # raw training data
        # self.segmented_sentences = None   # segmented sentences
        # self.train_index = None           # training sentences as dictionary indices
        # self.train_padding_index = None   # padded dictionary indices of the training sentences
        # endregion
        self.train_onehot_array = None
        # word2vec model
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert kwargs.has_key('word2vec_model_file_path'), \
                'please provide the word2vec_model_file_path attribute'
            # Load the word2vec model
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(kwargs.get('word2vec_model_file_path'))
        if verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')
        # self.fit_transform()

    def segment_sentence(self, sentence):
        """
        Segment a sentence with jieba.

        :param sentence: the sentence
        :type sentence: str
        :return: segmented sentence, tokens joined by spaces
        :rtype: str
        """
        if self.feature_type == 'seg':
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word':
            # Split the sentence into space-separated character units:
            # 1. preprocess with jieba first (number replacement etc.),
            # 2. then split into characters.
            segmented_sentence = self.jieba_seg.iter_each_word(
                sentence,
                sep=' ',
                need_segmented=True,
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word_seg':
            # Split the sentence into character and word units, deduplicated.
            # 1. Preprocess with jieba first (number replacement etc.)
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
            # 2. Split into characters
            word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split()
            # 3. Split into words
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(set(seg + word))
        elif self.feature_type == 'word_seg_concat':
            # Characters first, then words, not deduplicated.
            # 1. Preprocess with jieba first (number replacement etc.)
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
            # 2. Split into characters
            word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split()
            # 3. Split into words
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(word + seg)
        else:
            assert False, 'unsupported segmentation granularity!'
        return segmented_sentence

    def get_sentence_length(self, sentence):
        '''
        Compute the length of a sentence. Note that length is counted in
        tokens, i.e. after segmentation.

        1. Segment the sentence.
        2. Count its tokens.

        :param sentence: the sentence
        :type sentence: str
        :return: sentence length
        :rtype: int
        '''
        # 1. Segment
        segmented_sentence = self.segment_sentence(sentence)
        # 2. Count
        sentence_length = len(segmented_sentence.split())
        return sentence_length

    def print_sentence_length_detail(
            self,
            data=None,
            lengths=[7, 10, 15, 20, 50, 80, 100],
    ):
        """
        Print sentence-length statistics of the training set.

        :type lengths: list
        :param lengths: list of length thresholds
        :return: list of sentence lengths
        :rtype: list
        """
        if self.need_segmented:
            sentence_length = map(self.get_sentence_length, data)
        else:
            sentence_length = map(lambda x: len(x.split()), data)
        for l in lengths:
            le_this_len = sum(np.asarray(sentence_length) <= l) / (1.0 * len(sentence_length))
            print('fraction of sentences with length <= %d: %f' % (l, le_this_len))
        print('sentence lengths: %s' % (str(sentence_length)))
        print('max sentence length: %d' % (max(sentence_length)))
        print('min sentence length: %d' % (min(sentence_length)))
        print('mean sentence length: %f' % (np.average(sentence_length)))
        return sentence_length

    def get_unkown_vector(self, ndim=50):
        # Random vector for unknown words, with a fixed seed for reproducibility
        rand = np.random.RandomState(1337)
        return rand.uniform(-0.25, 0.25, ndim)

    def get_w2vEmbedding(self, word):
        """
        Return the word vector of `word`.

        Returns
        -------
        (array, str)
        """
        try:
            if word == u'PADDING':
                vector = np.zeros(self.word2vec_model.vector_size)
                flag = 'PADDING'
            elif word == u'UNKOWN':
                # Unknown token: use the fixed random vector
                vector = self.get_unkown_vector(self.word2vec_model.vector_size)
                flag = 'NO_IN_MODEL_VOCAB'
            else:
                vector = self.word2vec_model[word]
                flag = 'OK'
        except:
            vector = self.get_unkown_vector(self.word2vec_model.vector_size)
            if self.verbose > 1:
                print('OOV: %s' % word)
            flag = 'NO_IN_W2V'
        return np.asarray(vector), flag

    def to_embedding_weight(self, path):
        """
        Use a trained word2vec model to map every word in the dictionary to a
        word2vec vector, producing an initial weight matrix that can be used
        to initialise an Embedding layer.

        1. Load the word2vec model.
        2. Fill one row per dictionary entry.

        :param path: path of the word2vec model file
        :type path: str
        :return: embedding weight matrix
        """
        if self.word2vec_model is None:
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(path)
        size = self.vocabulary_size
        embedding_weights = np.zeros((size, self.word2vec_model.vector_size))
        words_count_no_in_w2v = 0
        words_count_no_in_vocab = 0
        words_count_in = 0
        words_count_padding = 0
        for key, value in self.train_data_dict.token2id.items():
            vector, flag = self.get_w2vEmbedding(key)
            embedding_weights[value, :] = vector
            if flag == 'NO_IN_W2V':
                words_count_no_in_w2v += 1
            if flag == 'NO_IN_MODEL_VOCAB':
                words_count_no_in_vocab += 1
            if flag == 'OK':
                words_count_in += 1
            if flag == 'PADDING':
                words_count_padding += 1
        if self.verbose > 0:
            print('words not in the w2v model: %d' % words_count_no_in_w2v)
            print('words not in the model vocab: %d' % words_count_no_in_vocab)
            print('words found in the w2v model: %d' % words_count_in)
        return embedding_weights

    def build_dictionary(self, train_X=None, test_X=None):
        """
        1. Segment the data.
        2. Build the training-set dictionary, inserting the special token
           'UNKOWN' for unknown words.

        Parameters
        ----------
        train_X : array-like
        test_X : array-like

        Returns
        --------
        object: self
        """
        # region -------------- 1. merge the training and test sets -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. merge the training and test sets')
            print('1. merge the training and test sets')
        if self.kwargs.get('vocabulary_including_test_set', True):
            X = np.concatenate((train_X, test_X), axis=0)
        else:
            X = train_X
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        # region -------------- 2. segment the data -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. segment the data')
            print('2. segment the data')
        if self.need_segmented:
            segmented_sentences = map(self.segment_sentence, X)
        else:
            segmented_sentences = X
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        # region -------------- 3. pad the sentences to equal length -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. pad the sentences to equal length')
            print('3. pad the sentences to equal length')
        # Align the sentences: truncate those longer than padding_length,
        # pad the shorter ones with PADDING
        padded_sentences = np.asarray(map(self.sentence_padding, segmented_sentences))
        # endregion

        # region -------------- 4. build the training-set dictionary -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('4. build the training-set dictionary')
            print('4. build the training-set dictionary')
        logging.debug('=' * 20)
        logging.debug('build the dictionary, map every token to an index, then map every sentence to an index list')
        # Split every sentence into a token list, giving a 2D training document,
        # e.g. [['今年','你','多少岁'],['你','二十四','小时','在线','吗'],...]
        train_document = map(lambda x: x.split(), padded_sentences)
        # Build the dictionary
        if self.padding_mode != 'none':
            # Prepend PADDING so that its index is guaranteed to be 0
            train_document.insert(0, [u'PADDING'])
        self.train_data_dict = Dictionary.from_documents(train_document)
        # Extend the dictionary with the special token
        # UNKOWN, which stands for unknown (OOV) words
        if self.add_unkown_word:
            self.train_data_dict.add_documents([[u'UNKOWN']])
        # Look up the dictionary indices of PADDING and UNKOWN
        self.padding_token_index = self.train_data_dict.token2id.get(u'PADDING', -1)
        self.unknow_token_index = self.train_data_dict.token2id.get(u'UNKOWN', -1)
        self.vocabulary_size = len(self.train_data_dict.keys())
        # Vocabulary sorted by index, ascending
        self.vocabulary = [token for token, id in sorted(self.train_data_dict.token2id.items(), key=lambda x: x[1])]
        if self.verbose > 1:
            logging.debug('dictionary size: %d' % (len(self.train_data_dict.keys())))
            print('dictionary size: %d' % (len(self.train_data_dict.keys())))
            logging.debug(u'dictionary: %s' % (','.join(self.train_data_dict.token2id.keys())))
            print(u'dictionary: %s' % (','.join(self.train_data_dict.token2id.keys())))
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion
        return padded_sentences

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        '''
        Replace OOV words in a sentence with their most similar in-vocabulary
        words (word2vec cosine similarity).

        :param sentence:
        :return:
        '''
        sentence = sentence.split()
        oov_word = []
        replace_word = []
        for item in sentence:
            if item not in self.vocabulary:
                oov_word.append(item)
                keywords_sim_score = np.asarray(
                    [self.word_similarity(word2vec_model, item, i) for i in self.vocabulary])
                sorted_index = np.argsort(keywords_sim_score)[-1::-1]
                most_similarity_score = keywords_sim_score[sorted_index[0]]
                most_similarity_word = self.vocabulary[sorted_index[0]]
                if self.verbose > 1:
                    print(u'the word most similar to %s is %s, score: %f' % (
                        item, most_similarity_word, most_similarity_score))
                replace_word.append(most_similarity_word)
        sentence += replace_word
        return ' '.join(sentence)

    def word_similarity(self, word2vec_model, word1, word2):
        '''
        Compute the similarity of two words; returns 0 when either is missing.

        :param word1:
        :param word2:
        :return:
        '''
        try:
            return word2vec_model.n_similarity(word1, word2)
        except:
            return 0

    def sentence_to_index(self, sentence):
        """
        Convert a sentence to indices; OOV tokens are mapped to UNKOWN.

        Parameters
        ----------
        sentence: str
            space-separated
        """
        if self.add_unkown_word:
            unknow_token_index = self.train_data_dict.token2id[u'UNKOWN']
        else:
            unknow_token_index = 0
        # Map every token of the sentence to its index, giving an index list
        index = [self.train_data_dict.token2id.get(item, unknow_token_index) for item in sentence.split()]
        if self.verbose > 1:
            if index.__contains__(unknow_token_index):
                print('unknow_token_index: %d' % unknow_token_index)
                print('dictionary OOV occurred')
                print(sentence)
                print(index)
        return index

    def sentence_padding(self, sentence):
        '''
        Align the sentences to equal length: truncate those longer than
        padding_length, pad the shorter ones with PADDING.

        :type sentence: str
        :param sentence: sentence, tokens separated by spaces
        :return: the padded sentence, tokens separated by spaces
        :rtype: str
        '''
        padding_length = self.sentence_padding_length
        sentence = sentence.split()
        sentence_length = len(sentence)
        if sentence_length > padding_length:
            sentence = sentence[:padding_length]
        elif sentence_length < padding_length:
            should_padding_length = padding_length - sentence_length
            left_padding = np.asarray(['PADDING'] * (should_padding_length / 2))
            right_padding = np.asarray(['PADDING'] * (should_padding_length - len(left_padding)))
            if self.padding_mode == 'center':
                sentence = np.concatenate((left_padding, sentence, right_padding), axis=0)
            elif self.padding_mode == 'left':
                sentence = np.concatenate((left_padding, right_padding, sentence), axis=0)
            elif self.padding_mode == 'right':
                sentence = np.concatenate((sentence, left_padding, right_padding), axis=0)
            elif self.padding_mode == 'none':
                sentence = sentence
            else:
                raise NotImplementedError
        sentence = ' '.join(sentence)
        return sentence

    def sentence_index_to_onehot(self, index):
        '''
        Note the difference from [sentence_index_to_bow]: this converts each
        word index into a onehot vector, e.g. index 1 --> [1, 0, 0, 0, 0].

        :param index: dictionary indices of one sentence
        :type index: list
        :return: onehot encoding, shape (sentence length, vocabulary size)
        :rtype: np.array()
        '''
        onehot_array = []
        for item in index:
            temp = np.zeros(self.vocabulary_size, dtype=int)
            if item == 0:
                # index 0 is the padding token: all-zero vector
                pass
            else:
                temp[item - 1] = 1
            onehot_array.append(temp)
        onehot_array = np.asarray(onehot_array)
        return onehot_array

    def sentence_index_to_bow(self, index):
        '''
        Note the difference from [sentence_index_to_onehot]: this converts the
        dictionary indices of a sentence into a bag-of-words vector, e.g.
        [1,2] --> [0, 1, 1, 0, 0, 0, 0, 0, 0, 0].

        :param index: dictionary indices of one sentence
        :type index: list
        :return: bow encoding, of length vocabulary size
        :rtype: np.array()
        '''
        onehot_array = np.zeros(self.vocabulary_size, dtype=int)
        onehot_array[index] = 1
        return onehot_array

    def batch_sentence_index_to_onehot_array(self, sentence_indexs):
        '''
        Convert all training sentences into onehot-encoded arrays, stored in
        self.onehot_array.

        :return: array of onehot encodings
        '''
        self.onehot_array = np.asarray(map(self.sentence_index_to_onehot, sentence_indexs))
        return self.onehot_array

    def fit_transform(self, train_data=None, test_data=None):
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self,
            train_X=None,
            test_X=None
            ):
        """
        Build the feature encoder, i.e. build the training-set dictionary.

        Notes
        ------
        update_dictionary: whether to rebuild the dictionary when fit() is
            called again; default True, i.e. the dictionary is only built on
            the first fit() call.
        vocabulary_including_test_set: whether the dictionary also covers the
            test-set vocabulary; default True (it covers all training and test
            words).
            - Set to False to build the dictionary from the training set only.

        Parameters
        ----------
        train_X: array-like
            training sentences: ['','',...,'']
        test_X: array-like
            test sentences: ['','',...,'']

        Returns
        -------
        object: self
        """
        if not self.kwargs.get('update_dictionary', True):
            # If the dictionary must not be updated, reuse the existing one when present
            if self.vocabulary is not None:
                return self
        logging.debug('=' * 20)
        if train_X is None:
            logging.debug('no training data given!')
            assert False, 'no training data given!'
        if self.kwargs.get('vocabulary_including_test_set', True):
            if test_X is None:
                logging.debug('vocabulary_including_test_set=True: building the dictionary needs all data, please provide the test data!')
                assert False, 'vocabulary_including_test_set=True: building the dictionary needs all data, please provide the test data!'

        # region -------------- 1. build the training-set dictionary -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. build the training-set dictionary')
            print('1. build the training-set dictionary')
        self.build_dictionary(train_X, test_X)
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion
        return self

    def transform_sentence(self, sentence):
        """
        Transform one sentence, applying the same operations as on the
        training data: pad the sentence and encode it as dictionary indices.

        1. Segment the sentence.
        2. Convert it to a dictionary-index list, then pad it.
            - With to_onehot_array=False (the default), the padded dictionary
              indices are returned directly;
            - with to_onehot_array=True, step 3 further converts them into
              onehot vectors.
        3. Turn every word index into a onehot vector.
            - This step does not always run: only when to_onehot_array=True.

        :param sentence: input sentence; no need to segment it beforehand,
            segmentation happens inside
        :type sentence: str
        :return: padded dictionary indices
        :rtype: array-like
        """
        assert self.train_data_dict is not None, 'please run fit_transform() first'

        # region -------------- 1. segmentation -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. segmentation')
            print('1. segmentation')
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence
        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(self.word2vec_model, seg_sentence)
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        # region -------------- 2. convert to a padded dictionary-index list -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. convert to a dictionary-index list, then pad it')
            print('2. convert to a dictionary-index list, then pad it')
        paded_sentence = self.sentence_padding(seg_sentence)
        sentence_index = self.sentence_to_index(paded_sentence)
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        # region -------------- 3. turn every word index into a onehot vector -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. turn every word index into a onehot vector')
            print('3. turn every word index into a onehot vector')
        if self.to_onehot_array:
            onehot_array = self.sentence_index_to_onehot(sentence_index)
        else:
            onehot_array = sentence_index
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion
        return onehot_array

    def transform(self, X):
        '''
        Transform data in batch, applying the same operations as on the
        training data: pad every sentence and encode it as dictionary indices.

        1. Simply delegates to self.transform_sentence.

        :param X: input sentences
        :type X: array-like
        :return: padded dictionary indices
        :rtype: array-like
        '''
        index = map(self.transform_sentence, X)
        return np.asarray(index)

    def reset(self):
        """
        Clear the object's data.
            - self.vocabulary
        """
        self.vocabulary = None

    def print_model_descibe(self):
        '''
        Print the model settings.

        :return: the parameter settings
        :rtype: dict
        '''
        import pprint
        # NOTE: train_data is no longer stored (see __init__), so its count is
        # not reported here.
        detail = {'need_segmented': self.need_segmented,
                  'feature_type': self.feature_type,
                  'verbose': self.verbose,
                  'full_mode': self.full_mode,
                  'remove_stopword': self.remove_stopword,
                  'replace_number': self.replace_number,
                  'sentence_padding_length': self.sentence_padding_length,
                  'padding_mode': self.padding_mode,
                  'vocabulary_size': self.vocabulary_size,
                  'padding_token_index': self.padding_token_index,
                  'unknow_token_index': self.unknow_token_index,
                  'add_unkown_word': self.add_unkown_word,
                  'mask_zero': True,
                  }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
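# An end-to-end sketch of the onehot FeatureEncoder on toy data (the toy
# sentences are assumptions; shapes depend on the fitted vocabulary). fit()
# needs the test sentences too because vocabulary_including_test_set defaults
# to True.
if __name__ == '__main__':
    train_X = [u'我要买手机', u'你好', u'早上好']
    test_X = [u'你好吗']
    encoder = FeatureEncoder(sentence_padding_length=5, padding_mode='center')
    train_index = encoder.fit_transform(train_X, test_X)
    print(train_index)  # padded dictionary-index arrays, shape (3, 5)
    print(encoder.transform_sentence(u'你好'))  # one padded index list
    encoder.print_model_descibe()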
class DataUtil(object):
    """
    Data utility class for the OOD dataset (stable version).
    Part of the data configuration lives in setting.py.
    """

    def __init__(self):
        # Root directory of the training data
        self.dataset_root_path = DATA_ROOT_PATH
        self.word2vec_model_root_path = WORD2VEC_MODEL_ROOT_PATH
        self.jieba_util = None

    def get_label_index(self, version='v2.0'):
        """
        Return the list of DA classification labels; the number of classes
        depends on the label version (24 for v2.0).

        :return: label_to_index, index_to_label
        """
        if version == 'v1.0':
            # 16 class labels
            index_to_label = [
                u'捣乱#骂人',
                u'导购#开始', u'导购#成交', u'导购#更换', u'导购#详情',
                u'表态#附和', u'表态#否定', u'表态#犹豫', u'表态#肯定', u'表态#否定#不满', u'表态#随便',
                u'闲聊#身份信息', u'闲聊#天气', u'闲聊#问候', u'闲聊#时间', u'闲聊#结束语',
            ]
        elif version == 'v2.0':
            # 24 class labels
            index_to_label = [
                u'其它#骂人',
                u'导购#不成交', u'导购#不理解', u'导购#开始', u'导购#成交', u'导购#更换', u'导购#结束', u'导购#详情',
                u'表态#不满', u'表态#否定', u'表态#满意', u'表态#犹豫', u'表态#疑问', u'表态#肯定', u'表态#附和', u'表态#随便',
                u'社交义务#不用谢', u'社交义务#接受道歉', u'社交义务#致谢', u'社交义务#道歉', u'社交义务#问候',
                u'闲聊#天气', u'闲聊#时间', u'闲聊#身份信息',
            ]
        elif version == 'v2.0++':
            # 24 class labels, temporary version used by ++
            index_to_label = [
                u'其它#骂人',
                u'导购#不成交', u'导购#不理解', u'导购#开始', u'导购#成交', u'导购#更换', u'导购#结束', u'导购#详情',
                u'社交义务#不用谢', u'社交义务#接受道歉', u'社交义务#致谢', u'社交义务#道歉', u'社交义务#问候',
                u'表态#不满', u'表态#否定', u'表态#满意', u'表态#犹豫', u'表态#疑问', u'表态#肯定', u'表态#附和', u'表态#随便',
                u'闲聊#天气', u'闲聊#时间', u'闲聊#身份信息',
            ]
        elif version == 'v2.0_small':
            # 17 class labels
            index_to_label = [
                u'其它#骂人',
                u'导购#开始', u'导购#成交', u'导购#更换', u'导购#结束', u'导购#详情',
                u'表态#否定', u'表态#不满', u'表态#犹豫', u'表态#肯定', u'表态#附和', u'表态#随便',
                u'社交义务#不用谢', u'社交义务#问候',
                u'闲聊#天气', u'闲聊#时间', u'闲聊#身份信息',
            ]
        label_to_index = {label: idx for idx, label in enumerate(index_to_label)}
        return label_to_index, index_to_label

    def transform_word2vec_model_name(self, flag):
        """
        Turn a flag into the full word2vec model file name.

        :param flag:
        :return:
        """
        from data_processing_util.word2vec_util.word2vec_util import Word2vecUtil
        return Word2vecUtil().transform_word2vec_model_name(flag)

    def transform_dataset_name(self, flag):
        """
        Turn a dataset flag into the actual training and test file names.

        :param flag: dataset flag: v1.0(S), v2.2(S), v2.2(Sa), v2.2(L), v2.3(S), ...
        :type flag: str
        :return: train_data_file_path, test_data_file_path
        """
        if flag == 'v1.0(S)':
            # v1.0 S dataset
            train_data_file_path = os.path.join(self.dataset_root_path, '20160526/train_all.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, '20160526/ood_labeled.csv')
        elif flag == 'v2.2(L)':
            # v2.2 L dataset
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.2/v2.2_train_L_2302.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.2/v2.2_test_L_76.csv')
        elif flag == 'v2.2(S)':
            # v2.2 S dataset
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.2/v2.2_train_S_1518.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.2/v2.2_test_S_131.csv')
        elif flag == 'v2.2(Sa)':
            # v2.2 Sa dataset
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.2/v2.2_train_Sa_893.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.2/v2.2_test_Sa_79.csv')
        elif flag == 'v2.3(L)':
            # v2.3 L dataset
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_train_L_2300.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_test_L_76.csv')
        elif flag == 'v2.3(S)':
            # v2.3 S dataset
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_train_S_1518.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_test_S_131.csv')
        elif flag == 'v2.3(Sa)':
            # v2.3 Sa dataset
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_train_Sa_891.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_test_Sa_79.csv')
        else:
            # Fall back to the v2.3 L dataset when the flag does not match
            train_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_train_L_2300.csv')
            test_data_file_path = os.path.join(self.dataset_root_path, 'v2.3/v2.3_test_L_76.csv')
        return train_data_file_path, test_data_file_path

    def merge_to_17class(self, data):
        '''
        Merge the labels of the new dataset version into 17 classes.

        :param data:
        :return:
        '''
        data.loc[data['LABEL'] == u'导购#不理解', 'LABEL'] = u'其它#其它'
        data.loc[data['LABEL'] == u'表态#疑问', 'LABEL'] = u'其它#其它'
        data.loc[data['LABEL'] == u'表态#满意', 'LABEL'] = u'表态#肯定'
        data.loc[data['LABEL'] == u'导购#不成交', 'LABEL'] = u'导购#结束'
        data.loc[data['LABEL'] == u'社交义务#接受道歉', 'LABEL'] = u'导购#结束'
        data.loc[data['LABEL'] == u'社交义务#致谢', 'LABEL'] = u'导购#结束'
        data.loc[data['LABEL'] == u'社交义务#道歉', 'LABEL'] = u'导购#结束'
        return data

    def load_train_test_data(self, config):
        """
        Load the training and test data selected by the config.
        The loaded files must contain the LABEL and SENTENCE fields.

        :param config: dict with dataset_type, label_version and verbose keys
        :return: train_data, test_data
        """
        logging.debug('=' * 20)
        train_data_file_path, test_data_file_path = self.transform_dataset_name(
            config['dataset_type'])
        # -------------- print start : just print info -------------
        if config['verbose'] > 0:
            logging.debug('loading the training and test data of dataset version %s\nlabel version: %s' % (
                config['dataset_type'], config['label_version']))
            print('loading the training and test data of dataset version %s\nlabel version: %s' % (
                config['dataset_type'], config['label_version']))
            logging.debug('train_data_file_path:%s' % train_data_file_path)
            logging.debug('test_data_file_path:%s' % test_data_file_path)
            print('train_data_file_path:%s' % train_data_file_path)
            print('test_data_file_path:%s' % test_data_file_path)
        # -------------- print end : just print info -------------
        train_data = pd.read_csv(train_data_file_path,
                                 sep='\t',
                                 encoding='utf8',
                                 header=0)
        test_data = pd.read_csv(test_data_file_path,
                                sep='\t',
                                encoding='utf8',
                                header=0)
        if config['label_version'] == 'v2.0_small':
            train_data = self.merge_to_17class(train_data)
            test_data = self.merge_to_17class(test_data)
        if config['verbose'] > 0:
            logging.debug('fit data shape is :%s' % (str(train_data.shape)))
            print('fit data shape is :%s' % (str(train_data.shape)))
            logging.debug('test data shape is :%s' % (str(test_data.shape)))
            print('test data shape is :%s' % (str(test_data.shape)))
            logging.debug('-' * 20)
        # Drop the categories 其它#其它, ID and 其它#捣乱
        logging.debug('dropping the categories 其它#其它, ID and 其它#捣乱')
        print('dropping the categories 其它#其它, ID and 其它#捣乱')
        filter_row = lambda x: x not in [u'其它#其它', u'其他#其他', u'ID', u'其它#捣乱']
        train_data['IS_FILTER'] = train_data['LABEL'].apply(filter_row)
        test_data['IS_FILTER'] = test_data['LABEL'].apply(filter_row)
        train_data = train_data[train_data['IS_FILTER'] == True]
        test_data = test_data[test_data['IS_FILTER'] == True]
        if config['verbose'] > 0:
            logging.debug('fit data shape is :%s' % (str(train_data.shape)))
            print('fit data shape is :%s' % (str(train_data.shape)))
            logging.debug('test data shape is :%s' % (str(test_data.shape)))
            print('test data shape is :%s' % (str(test_data.shape)))
            logging.debug('-' * 20)
        train_data = train_data[['LABEL', 'SENTENCE']]
        test_data = test_data[['LABEL', 'SENTENCE']]
        label_to_index, index_to_label = self.get_label_index(
            version=config['label_version'])
        if config['verbose'] > 0:
            logging.debug(u'number of classes: %d, namely: %s' % (len(index_to_label), ','.join(index_to_label)))
            print(u'number of classes: %d, namely: %s' % (len(index_to_label), ','.join(index_to_label)))
        train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index)
        test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index)
        return train_data, test_data

    def batch_segment_sentences(self, sentences):
        '''
        Segment a batch of sentences.

        :param sentences: array-like
        :return: list of segmented sentences
        '''
        self.jieba_util = Jieba_Util()
        segmented_sentences = map(self.segment_sentence, sentences)
        return segmented_sentences

    def segment_sentence(self, sentence):
        '''
        Segment one sentence.

        :param sentence:
        :return: segmented sentence
        '''
        segmented_sentence = self.jieba_util.seg(
            sentence=sentence,
            sep=' ',
            full_mode=True,
            remove_stopword=False,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
        )
        return segmented_sentence

    def save_data(self, data, path):
        '''
        Save data in DataFrame format.

        :param data: the data
        :param path: path of the data file
        :return: None
        '''
        data.to_csv(
            path,
            sep='\t',
            header=True,
            index=False,
            encoding='utf8',
        )

    def save_result(self, data, predict, is_correct, path):
        '''
        Save the prediction results.

        :param data: the data, a DataFrame
        :param predict: the predictions
        :type predict: array-like
        :param is_correct: whether each prediction is correct
        :param path: the path
        :return: None
        '''
        label_to_index, index_to_label = self.get_label_index()
        data['PREDICT'] = [index_to_label[item] for item in predict]
        data['is_correct'] = is_correct
        self.save_data(data, path)

    def load_data(self, path):
        '''
        Load data in DataFrame format.

        :param path: path of the data file
        :return: the loaded pd.DataFrame
        '''
        data = pd.read_csv(
            path,
            sep='\t',
            header=0,
            encoding='utf8',
            index_col=0,
        )
        return data

    def get_k_fold_data(
            self,
            k=5,
            data=None,
            rand_seed=0,
    ):
        '''
        Split the data into k folds.

        :param k: number of folds
        :param data:
        :type data: pd.DataFrame()
        :return: cv_x, cv_y
        '''
        train_X = data['SENTENCE'].as_matrix()
        train_y = data['LABEL_INDEX'].as_matrix()
        cv_x = []
        cv_y = []
        for x, y in data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=rand_seed):
            cv_x.append(x)
            cv_y.append(y)
        return cv_x, cv_y
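# Sketch of a typical DataUtil round trip (the config keys are the ones
# consumed by load_train_test_data above; the dataset files are assumed to
# exist under DATA_ROOT_PATH):
if __name__ == '__main__':
    du = DataUtil()
    config = {'dataset_type': 'v2.3(S)', 'label_version': 'v2.0', 'verbose': 1}
    train_data, test_data = du.load_train_test_data(config)
    cv_x, cv_y = du.get_k_fold_data(k=5, data=train_data, rand_seed=0)
    print('number of folds: %d' % len(cv_x))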
class FeatureEncoder(object):
    '''
    ## Overview
    BOW feature encoder: implemented on top of sklearn's CountVectorizer and
    TfidfVectorizer, it turns sentences into BOW (count) or TFIDF encodings.

    ## Two feature granularities are currently supported:
    characters (word) and segmented words (seg)

    Main methods:
        1. segment_sentence: segment a sentence
        2. transform_sentence: built-in, encode one sentence
        3. fit_transform: build the encoder and transform the data
        4. transform: transform data
        5. print_sentence_length_detail: todo, print sentence-length statistics of the training set
        6. print_model_descibe: print the model settings
    '''

    def __init__(self,
                 # rand_seed=1337,
                 verbose=0,
                 need_segmented=True,
                 full_mode=True,
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 feature_method='bow',
                 feature_type='seg',
                 max_features=None,
                 word2vec_to_solve_oov=False,
                 save_middle_result=False,
                 **kwargs):
        '''
        1. Initialise and validate the parameters.
        2. Build the feature encoder.

        :param verbose: the larger the value, the more detailed the output
        :type verbose: int
        :param need_segmented: whether the input still needs word segmentation.
            If False, every sentence must already be space-separated, e.g.
            ['我 要 买 手机','你好','早上 好']; if True, raw sentences are fine,
            e.g. ['我要买手机','你好','早上好'].
        :type need_segmented: bool
        :param full_mode: jieba option, whether to use full mode, default True
        :type full_mode: bool
        :param remove_stopword: jieba option, whether to remove stop words, default True
        :type remove_stopword: bool
        :param replace_number: jieba option, whether to replace numbers with NUM, default True
        :type replace_number: bool
        :param lowercase: jieba option, whether to lowercase the text, default True
        :type lowercase: bool
        :param zhs2zht: jieba option, whether to convert traditional characters to simplified, default True
        :type zhs2zht: bool
        :param remove_url: jieba option, whether to remove Weibo URLs (addresses starting with http://t.cn/), default True
        :type remove_url: bool
        :param feature_method: feature weighting method, bow or tfidf
        :type feature_method: str
        :param feature_type: granularity of the feature unit; one of word, seg and word_seg.
            - word: character units, e.g. 我要买手机 ---> 我 要 买 手 机
            - seg: segmented word units, e.g. 我要买手机 ---> 我 要 买 手机
            - word_seg: characters plus segmented words, e.g. 我要买手机 ---> 我 要 买 手机 手 机
        :type feature_type: str
        :param max_features: maximum number of feature words to select
        :type max_features: int
        :param word2vec_to_solve_oov: use word2vec to expand OOV words
        :type word2vec_to_solve_oov: bool
        :param save_middle_result: whether to keep intermediate results; off by default to save space!
        :type save_middle_result: bool
        :param kwargs: supports word2vec_model_file_path etc.
        :type kwargs: dict
        '''
        # self.rand_seed = rand_seed
        self.save_middle_result = save_middle_result
        self.verbose = verbose
        self.full_mode = full_mode
        self.remove_stopword = remove_stopword
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.feature_method = feature_method
        self.feature_type = feature_type
        self.max_features = max_features
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # Validate parameters
        assert self.feature_method in ['bow', 'tfidf'], \
            'feature_method must be one of: bow, tfidf'
        assert self.feature_type in ['word', 'seg', 'word_seg'], \
            'feature_type must be one of: word, seg, word_seg'

        if word2vec_to_solve_oov:
            # Load the word2vec model
            assert kwargs.has_key('word2vec_model_file_path'), \
                'please provide the word2vec_model_file_path attribute'
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        # Initialise the jieba segmenter
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)

        # Feature encoder: bow or tf-idf transformer
        self.feature_encoder = None
        # Dictionary object extracted from the training set
        self.train_data_dict = None
        # Vocabulary list extracted from the training set
        self.vocabulary = None
        # Vocabulary size of the training set
        self.vocabulary_size = None
        # Number of training examples
        self.train_data_count = 0

        # region To save memory, set save_middle_result=False in production (disables intermediate results)
        if self.save_middle_result:
            # Raw training data
            self.train_data = None
            # Segmented sentences
            self.segmented_sentences = None
            # Training-sentence features
            self.train_features = None
        # endregion

        # word2vec model
        # self.word2vec_model = None
        # self.fit_transform()

    def segment_sentence(self, sentence):
        '''
        Segment a sentence with jieba.

        :param sentence: the sentence
        :type sentence: str
        :return: segmented sentence, tokens joined by spaces
        :rtype: str
        '''
        if self.feature_type == 'seg':
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word':
            # Split the sentence into space-separated character units:
            # 1. preprocess with jieba first (number replacement etc.),
            # 2. then split into characters.
            segmented_sentence = self.jieba_seg.iter_each_word(
                sentence,
                sep=' ',
                need_segmented=True,
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word_seg':
            # Split the sentence into character and word units, deduplicated.
            # 1. Preprocess with jieba first (number replacement etc.)
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
            # 2. Split into characters
            word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split()
            # 3. Split into words
            seg = segmented_sentence.split()
            segmented_sentence = ' '.join(set(seg + word))
        else:
            assert False, 'unsupported segmentation granularity!'
        return segmented_sentence

    def reset(self):
        """Reset the object.

        Returns
        -------
        """
        self.feature_encoder = None

    def fit_transform(self, train_data=None, test_data=None):
        """
        Build the feature encoder.

        1. fit
        2. transform the data

        :param train_data: training sentences: ['','',...,'']
        :type train_data: array-like.
        :return: the encoded vectors of train_data
        """
        # Number of training examples
        self.train_data_count = len(train_data)
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self, train_data=None, test_data=None):
        """
        Build the feature encoder.

        1. Convert the data format and segment it.
        2. Build the vectorizer.

        :param train_data: training sentences: ['','',...,'']
        :type train_data: array-like.
        :return: self
        """
        if self.verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')

        # -------------- region start : 1. convert the data format and segment it -------------
        if self.verbose > 2:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. convert the data format and segment it')
            print('1. convert the data format and segment it')
        assert train_data is not None, 'no training data given!'
        train_data = np.asarray(train_data)

        # To save memory, set save_middle_result=False in production
        if self.save_middle_result:
            self.train_data = train_data

        if self.need_segmented:
            # Segment
            train_segmented_sentences = map(self.segment_sentence, train_data)
        else:
            # No segmentation needed
            train_segmented_sentences = train_data
        if self.verbose > 2:
            logging.debug('-' * 20)
            print('-' * 20)
        # -------------- region end : 1. convert the data format and segment it ---------------

        if self.feature_encoder is None:
            # Create the feature encoder if it does not exist yet
            if self.feature_method == 'tfidf':
                self.feature_encoder = TfidfVectorizer(
                    analyzer="word",
                    token_pattern=u'(?u)\\b\\w+\\b',
                    tokenizer=None,
                    preprocessor=None,
                    lowercase=False,
                    stop_words=None,
                    # vocabulary=tfidf_vocabulary,
                    max_features=self.max_features,
                )
            elif self.feature_method == 'bow':
                self.feature_encoder = CountVectorizer(
                    analyzer="word",
                    token_pattern=u'(?u)\\b\\w+\\b',
                    tokenizer=None,
                    preprocessor=None,
                    lowercase=False,
                    stop_words=None,
                    # vocabulary=tfidf_vocabulary,
                    max_features=self.max_features,
                )
            else:
                raise NotImplementedError

        self.feature_encoder.fit_transform(train_segmented_sentences)
        # To save memory, the transformed features are not kept here
        # (with save_middle_result=True they could be stored in self.train_features)

        # Vocabulary
        self.vocabulary = self.feature_encoder.get_feature_names()
        # Vocabulary size
        self.vocabulary_size = len(self.vocabulary)
        return self

    def word_similarity(self, word2vec_model, word1, word2):
        '''
        Compute the similarity of two words.

        Parameters
        ----------
        word2vec_model : gensim object
            word2vec_model gensim Word2Vec model
        word1:
        word2:

        Returns
        --------
        similarity score: float
        '''
        try:
            return word2vec_model.n_similarity(word1, word2)
        except:
            return 0

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        '''
        Replace OOV words in a sentence with their most similar in-vocabulary
        words (word2vec cosine similarity).

        :param sentence:
        :return:
        '''
        sentence = sentence.split()
        oov_word = []
        replace_word = []
        for item in sentence:
            if item not in self.vocabulary:
                oov_word.append(item)
                keywords_sim_score = np.asarray([
                    self.word_similarity(word2vec_model, item, i)
                    for i in self.vocabulary
                ])
                sorted_index = np.argsort(keywords_sim_score)[-1::-1]
                most_similarity_score = keywords_sim_score[sorted_index[0]]
                most_similarity_word = self.vocabulary[sorted_index[0]]
                if self.verbose > 1:
                    print(u'the word most similar to %s is %s, score: %f' % (
                        item, most_similarity_word, most_similarity_score))
                replace_word.append(most_similarity_word)
        sentence += replace_word
        return ' '.join(sentence)

    def transform_sentence(
            self,
            sentence,
    ):
        '''
        Transform one sentence, applying the same operations as on the
        training data: encode it as a bow or tfidf vector.

        1. Segment the sentence.
        2. Encode it.

        :param sentence: input sentence; no need to segment it beforehand,
            segmentation happens inside
        :type sentence: str
        :return: feature vector
        :rtype: array-like
        '''
        # region -------------- 1. segmentation -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. segmentation')
            print('1. segmentation')
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence
        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(
                self.word2vec_model, seg_sentence)
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1. segmentation ---------------

        # region -------------- 2. feature transform -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. feature transform')
            print('2. feature transform')
        features = self.feature_encoder.transform([seg_sentence]).toarray()[0]
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 2. feature transform ---------------
        return features

    def transform(
            self,
            data,
    ):
        '''
        Transform data in batch, with the same operations as transform_sentence().

        1. Simply delegates to self.transform_sentence.

        :param data: input sentences
        :type data: array-like
        :return: bow vectors
        :rtype: array-like
        '''
        index = map(self.transform_sentence, data)
        return np.asarray(index)

    def print_model_descibe(self):
        '''
        Print the model settings.

        :return: the parameter settings
        :rtype: dict
        '''
        import pprint
        detail = {
            'train_data_count': self.train_data_count,
            'need_segmented': self.need_segmented,
            'word2vec_to_solve_oov': self.word2vec_to_solve_oov,
            'vocabulary_size': self.vocabulary_size,
            'verbose': self.verbose,
            # 'rand_seed': self.rand_seed,
            'full_mode': self.full_mode,
            'remove_stopword': self.remove_stopword,
            'replace_number': self.replace_number,
            'lowercase': self.lowercase,
            'zhs2zht': self.zhs2zht,
            'remove_url': self.remove_url,
            'feature_method': self.feature_method,
            'feature_type': self.feature_type,
            'max_features': self.max_features,
        }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
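# Sketch of the BOW/TFIDF FeatureEncoder on toy data (the toy sentences are
# assumptions; the vector width equals the fitted vocabulary size):
if __name__ == '__main__':
    train_data = [u'我要买手机', u'你好', u'早上好']
    bow_encoder = FeatureEncoder(feature_method='bow', feature_type='seg', verbose=0)
    train_features = bow_encoder.fit_transform(train_data)
    print(train_features.shape)  # (3, vocabulary_size)
    print(bow_encoder.transform_sentence(u'我要买手机'))  # one bow count vector
    bow_encoder.print_model_descibe()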
def __init__(self):
    # Initialise the jieba utility
    self.jieba_util = Jieba_Util()
""" Author: 'jdwang' Date: 'create date: 2016-07-16' Email: '*****@*****.**' Describe: """ from __future__ import print_function import numpy as np import pandas as pd import logging import timeit from data_processing_util.jiebanlp.jieba_util import Jieba_Util jutil = Jieba_Util(verbose=0) remove_sentence_punctuation = lambda x: jutil.seg(x, sep='', remove_url=False) # 统计 进入协处理的对话段数 ch2r_dialogue_file_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/dev_vesion/ch2r_test_dataset/start-20150613测试集/data/dialogue_usersentence_ge_1.csv' ch2r_dialogue = pd.read_csv( ch2r_dialogue_file_path, sep='\t', encoding='utf8', header=0, ) user_sentence = ch2r_dialogue[ch2r_dialogue['Name'] != 'Ch2R']
class DataUtil(object): def __init__(self): # 训练数据的根目录 self.dataset_root_path = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/ood_dataset/stable_vesion/' self.word2vec_model_root_path = '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/' self.jieba_util = None def get_label_index(self,version='v2.0'): """ 获取 DA 分类类别的列表,总共有24类 :return: label_to_index,index_to_label """ if version == 'v1.0': # 16分类标签 index_to_label = [ u'捣乱#骂人', u'导购#开始',u'导购#成交', u'导购#更换',u'导购#详情', u'表态#附和',u'表态#否定', u'表态#犹豫',u'表态#肯定',u'表态#否定#不满',u'表态#随便', u'闲聊#身份信息',u'闲聊#天气', u'闲聊#问候',u'闲聊#时间',u'闲聊#结束语', ] elif version=='v2.0': # 24分类标签 index_to_label = [ u'其它#骂人', u'导购#不成交',u'导购#不理解',u'导购#开始', u'导购#成交',u'导购#更换',u'导购#结束',u'导购#详情', u'表态#不满',u'表态#否定',u'表态#满意', u'表态#犹豫',u'表态#疑问',u'表态#肯定',u'表态#附和',u'表态#随便', u'社交义务#不用谢',u'社交义务#接受道歉',u'社交义务#致谢', u'社交义务#道歉',u'社交义务#问候', u'闲聊#天气',u'闲聊#时间',u'闲聊#身份信息' ] elif version=='v2.0++': # 24分类标签 ++所用版本,临时用 index_to_label = [ u'其它#骂人', u'导购#不成交',u'导购#不理解',u'导购#开始', u'导购#成交',u'导购#更换',u'导购#结束',u'导购#详情', u'社交义务#不用谢',u'社交义务#接受道歉',u'社交义务#致谢', u'社交义务#道歉',u'社交义务#问候', u'表态#不满',u'表态#否定',u'表态#满意', u'表态#犹豫',u'表态#疑问',u'表态#肯定',u'表态#附和',u'表态#随便', u'闲聊#天气',u'闲聊#时间',u'闲聊#身份信息' ] elif version=='v2.0_small': # 17分类标签 index_to_label = [ u'其它#骂人', u'导购#开始', u'导购#成交',u'导购#更换',u'导购#结束',u'导购#详情', u'表态#否定',u'表态#不满', u'表态#犹豫',u'表态#肯定',u'表态#附和',u'表态#随便', u'社交义务#不用谢', u'社交义务#问候', u'闲聊#天气',u'闲聊#时间',u'闲聊#身份信息' ] # print('类别数为:%d'%len(index_to_label)) label_to_index = {label: idx for idx, label in enumerate(index_to_label)} return label_to_index,index_to_label def transform_word2vec_model_name(self,flag): ''' 根据 flag 转换成完整的 word2vec 模型文件名 :param flag: :return: ''' from data_processing_util.word2vec_util.word2vec_util import Word2vecUtil return Word2vecUtil().transform_word2vec_model_name(flag) def transform_dataset_name(self,flag): """ 将数据集标记转为真正的训练集和测试集文件名 :param flag: 数据集标记 v1.0(S),v2.2(S),v2.2(Sa),v2.2(L),v2.3(S) :type flag: str :return: train_data_file_path,test_data_file_path """ if flag == 'v1.0(S)': # 使用v2.2 L版本的数据集 train_data_file_path = self.dataset_root_path + '20160526/train_all.csv' test_data_file_path = self.dataset_root_path + '20160526/ood_labeled.csv' elif flag == 'v2.2(L)': # 使用v2.2 L版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_L_2302.csv' test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_L_76.csv' elif flag == 'v2.2(S)': # 使用v2.2 S版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_S_1518.csv' test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_S_131.csv' elif flag == 'v2.2(Sa)': # 使用v2.2 Sa版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_Sa_893.csv' test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_Sa_79.csv' # else: # 如果匹配不上,则使用v2.2 Sa版本的数据集 # train_data_file_path = self.dataset_root_path + 'v2.2/v2.2_train_L_2302.csv' # test_data_file_path = self.dataset_root_path + 'v2.2/v2.2_test_L_76.csv' elif flag == 'v2.3(L)': # 使用v2.2 L版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_L_2300.csv' test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_L_76.csv' elif flag == 'v2.3(S)': # 使用v2.2 S版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_S_1518.csv' test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_S_131.csv' elif flag == 'v2.3(Sa)': # 使用v2.2 Sa版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_Sa_891.csv' test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_Sa_79.csv' else: # 
如果匹配不上,则使用v2.2 Sa版本的数据集 train_data_file_path = self.dataset_root_path + 'v2.3/v2.3_train_L_2300.csv' test_data_file_path = self.dataset_root_path + 'v2.3/v2.3_test_L_76.csv' return train_data_file_path,test_data_file_path def merge_to_17class(self,data): ''' 将新版数据集合并成17个类别 :param data: :return: ''' data.loc[data['LABEL']==u'导购#不理解','LABEL'] = u'其它#其它' data.loc[data['LABEL']==u'表态#疑问','LABEL'] = u'其它#其它' data.loc[data['LABEL']==u'表态#满意','LABEL'] = u'表态#肯定' data.loc[data['LABEL']==u'导购#不成交','LABEL'] = u'导购#结束' data.loc[data['LABEL']==u'社交义务#接受道歉','LABEL'] = u'导购#结束' data.loc[data['LABEL']==u'社交义务#致谢','LABEL'] = u'导购#结束' data.loc[data['LABEL']==u'社交义务#道歉','LABEL'] = u'导购#结束' # print(','.join(data['LABEL'].unique())) # print(len(data['LABEL'].unique())) # quit() return data def load_train_test_data(self,config): """ 加载训练数据和测试数据,根据配置选择 加载的文件中一定要有 LABEL 和 SENTENCE 字段 :param config: :return: """ logging.debug('=' * 20) train_data_file_path, test_data_file_path = self.transform_dataset_name(config['dataset_type']) # -------------- print start : just print info ------------- if config['verbose'] > 0 : logging.debug('加载%s版本数据集的训练数据和测试数据\n标注版本:%s'%(config['dataset_type'],config['label_version'])) print('加载%s版本数据集的训练数据和测试数据\n标注版本:%s'%(config['dataset_type'],config['label_version'])) logging.debug('train_data_file_path:%s'%train_data_file_path) logging.debug('test_data_file_path:%s'%test_data_file_path) print('train_data_file_path:%s'%train_data_file_path) print('test_data_file_path:%s'%test_data_file_path) # -------------- print end : just print info ------------- train_data = pd.read_csv( train_data_file_path, sep='\t', encoding='utf8', header=0 ) test_data = pd.read_csv( test_data_file_path, sep='\t', encoding='utf8', header=0 ) if config['label_version']=='v2.0_small': train_data = self.merge_to_17class(train_data) test_data = self.merge_to_17class(test_data) if config['verbose'] > 0: logging.debug('fit data shape is :%s' % (str(train_data.shape))) print('fit data shape is :%s' % (str(train_data.shape))) logging.debug('test data shape is :%s' % (str(test_data.shape))) print('test data shape is :%s' % (str(test_data.shape))) logging.debug('-' * 20) # 去除类别 其他#其他 logging.debug('去除类别 其他#其他 ID,其他#捣乱') print('去除类别 其它#其它 ID 其他#捣乱') filter_row = lambda x: x not in [u'其它#其它', u'其他#其他', u'ID',u'其它#捣乱'] train_data['IS_FILTER'] = train_data['LABEL'].apply(filter_row) test_data['IS_FILTER'] = test_data['LABEL'].apply(filter_row) train_data = train_data[train_data['IS_FILTER'] == True] test_data = test_data[test_data['IS_FILTER'] == True] if config['verbose'] > 0: logging.debug('fit data shape is :%s' % (str(train_data.shape))) print('fit data shape is :%s' % (str(train_data.shape))) logging.debug('test data shape is :%s' % (str(test_data.shape))) print('test data shape is :%s' % (str(test_data.shape))) logging.debug('-' * 20) train_data = train_data[['LABEL', 'SENTENCE']] test_data = test_data[['LABEL', 'SENTENCE']] label_to_index,index_to_label =self.get_label_index(version=config['label_version']) if config['verbose'] > 0: logging.debug(u'总共类别数:%d,分别为:%s' % (len(index_to_label), ','.join(index_to_label))) print(u'总共类别数:%d,分别为:%s' % (len(index_to_label), ','.join(index_to_label))) train_data['LABEL_INDEX'] = train_data['LABEL'].map(label_to_index) test_data['LABEL_INDEX'] = test_data['LABEL'].map(label_to_index) return train_data,test_data def batch_segment_sentences(self,sentences): ''' 对多个句子批量分词 :param sentences: array-like :return: ''' self.jieba_util = Jieba_Util() segmented_sentences = 
map(self.segment_sentence,sentences) return segmented_sentences def segment_sentence(self,sentence): ''' 将句子进行分词 :param sentence: :return: ''' segmented_sentence = self.jieba_util.seg(sentence=sentence, sep=' ', full_mode=True, remove_stopword=False, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, ) return segmented_sentence def save_data(self,data,path): ''' 保存DataFrame格式的数据 :param data: 数据 :param path: 数据文件的路径 :return: None ''' data.to_csv(path, sep='\t', header=True, index=False, encoding='utf8', ) def save_result(self,data,predict,is_correct,path): ''' 将预测结果进行保存 :param data: 数据,DataFrame :param predict: 预测结果 :type predict: array-like :param is_correct: 是否正确 :param path: 路径 :return: None ''' label_to_index, index_to_label = self.get_label_index() data['PREDICT'] = [index_to_label[item] for item in predict] data['is_correct'] = is_correct self.save_data(data,path) def load_data(self,path): ''' 加载DataFrame格式的数据 :param data: 数据 :param path: 数据文件的路径 :return: None ''' data = pd.read_csv(path, sep='\t', header=0, encoding='utf8', index_col=0, ) return data def get_k_fold_data(self, k=5, data=None, rand_seed = 0, ): ''' 将数据分为K-fold :param k: :param data: :type data: pd.DataFrame() :return: ''' train_X = data['SENTENCE'].as_matrix() train_y = data['LABEL_INDEX'].as_matrix() cv_x = [] cv_y = [] for x, y in data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=rand_seed): cv_x.append(x) cv_y.append(y) return cv_x,cv_y
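# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): load the v2.3(Sa) split with the v2.0
# label set, then build 5-fold cross-validation data. The config keys mirror
# the ones read inside load_train_test_data; note that get_k_fold_data relies
# on a data_split_k_fold helper imported elsewhere in this project, and the
# dataset files must exist at the paths resolved by transform_dataset_name.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    data_util = DataUtil()
    config = {
        'dataset_type': 'v2.3(Sa)',
        'label_version': 'v2.0',
        'verbose': 1,
    }
    train_data, test_data = data_util.load_train_test_data(config)
    cv_x, cv_y = data_util.get_k_fold_data(k=5, data=train_data, rand_seed=0)
    print('number of folds: %d' % len(cv_x))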
class DataUtil(object): ''' 微博立场分析数据处理工具类,包含以下函数: 1. load_data:加载csv格式的数据 2. save_data:保存csv格式的数据 3. print_data_detail: 打印数据详情 4. processing_na_value:处理空值数据 5. segment_sentence:分词 6. split_train_test:切分训练集和测试集 7. ''' def __init__(self): # 初始化jieba工具 self.jieba_util = Jieba_Util() def load_data(self, path, header=True): ''' 读取数据 :param path: 数据文件的路径 :return: ''' if header: data = pd.read_csv( path, sep='\t', header=0, encoding='utf8', ) else: data = pd.read_csv( path, sep='\t', header=None, encoding='utf8', ) return data def load_train_test_data(self, config=None): ''' 加载训练数据和测试数据,已经标签索引 :param config: 一些配置信息 :param config: dict :return: ''' # -------------- region start : 1. 加载训练集和测试集 ------------- if config['verbose'] > 2: logging.debug('-' * 20) print '-' * 20 logging.debug('1. 加载训练集和测试集') print '1. 加载训练集和测试集' # -------------- code start : 开始 ------------- train_data_file_path = ( config['train_data_file_path']) % config['train_data_type'] test_data_file_path = ( config['test_data_file_path']) % config['test_data_type'] logging.debug(train_data_file_path) print train_data_file_path logging.debug(test_data_file_path) print test_data_file_path data_util = DataUtil() train_data = data_util.load_data(train_data_file_path) test_data = data_util.load_data(test_data_file_path) # -------------- code start : 结束 ------------- if config['verbose'] > 2: logging.debug('-' * 20) print '-' * 20 # -------------- region end : 1. 加载训练集和测试集 --------------- # 生成类别索引 label_to_index = {u'FAVOR': 0, u'AGAINST': 1, u'NONE': 2} index_to_label = [u'FAVOR', u'AGAINST', u'NONE'] return train_data, test_data, label_to_index, index_to_label def save_data(self, data, path): ''' 保存数据 :param path: 数据文件的路径 :return: ''' data.to_csv( path, sep='\t', header=True, index=False, encoding='utf8', ) def print_data_detail(self, data, has_stance=True): ''' 展示数据的详细信息 :param data: Dateframe对象 :param has_stance: 是否有STANCE字段 :return: 无 ''' logging.debug('data的个数为:%d' % (len(data))) logging.debug('data的sample数据:') logging.debug(data.head()) logging.debug('data的target和个数分别为:') logging.debug(data['TARGET'].value_counts()) if has_stance: logging.debug('统计每个Target下各个类型立场的数量...') group = data.groupby(by=['TARGET', 'STANCE']) logging.debug(group.count()) else: logging.debug('没有STANCE字段') logging.debug('数据各个字段情况...') # print data.info() for column in data.columns: # 统计每个字段是否有数据是空串 # 先将所有空字符串用nan替换 data[column] = data[column].replace(r'^\s*$', np.nan, regex=True) count_null = sum(data[column].isnull()) if count_null != 0: logging.warn( u'%s字段有空值,个数:%d,建议使用processing_na_value()方法进一步处理!' 
% (column, count_null)) null_data_path = './result/null_data.csv' logging.warn(u'将缺失值数据输出到文件:%s' % (null_data_path)) data[data[column].isnull()].to_csv(null_data_path, index=None, encoding='utf8', sep='\t') def processing_na_value(self, data, clear_na=True, fill_na=False, fill_char='NULL', columns=None): ''' 处理数据的空值 :param data: Dateframe对象 :param clear_na: bool,是否去掉空值数据 :param fill_na: bool,是否填充空值 :param fill_char: str,填充空置的字符 :param column: list,需要处理的字段,默认为None时,对所有字段处理 :return: Dateframe对象 ''' logging.debug('[def processing_na_value()] 对缺失值进行处理....') for column in data.columns: if columns == None or column in columns: data[column] = data[column].replace(r'^\s*$', np.nan, regex=True) count_null = sum(data[column].isnull()) if count_null != 0: logging.warn(u'%s字段有空值,个数:%d' % (column, count_null)) if clear_na: logging.warn(u'对数据的%s字段空值进行摘除' % (column)) data = data[data[column].notnull()].copy() else: if fill_na: logging.warn(u'对数据的%s字段空值进行填充,填充字符为:%s' % (column, fill_char)) data[column] = data[column].fillna(value=fill_char) return data def segment_sentence(self, sentence): segmented_sentence = self.jieba_util.seg( sentence=sentence, sep=' ', full_mode=True, remove_stopword=True, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, ) return segmented_sentence def split_train_test(self, data, train_split=0.7): ''' 将数据切分成训练集和验证集 :param data: :param train_split: float,取值范围[0,1],设置训练集的比例 :return: dev_data,test_data ''' logging.debug('对数据随机切分成train和test数据集,比例为:%f' % (train_split)) num_train = len(data) num_dev = int(num_train * train_split) num_test = num_train - num_dev logging.debug('全部数据、训练数据和测试数据的个数分别为:%d,%d,%d' % (num_train, num_dev, num_test)) rand_list = np.random.RandomState(0).permutation(num_train) # print rand_list # print rand_list[:num_dev] # print rand_list[num_dev:] dev_data = data.iloc[rand_list[:num_dev]].sort_index() test_data = data.iloc[rand_list[num_dev:]].sort_index() # print dev_data # print test_data return dev_data, test_data def count_word_freq(self, data): ''' 统计每个词 在各个类别中的次数,每个词有四个统计项: 1. FAVOR: 在favor类别中的出现的次数 2. AGAINST:在AGAINST类别中的出现的次数 3. NONE : 在NONE类别中的出现的次数 4. FREQ : 在所有类别中的出现的次数,即FAVOR+AGAINST+NONE 5. 
SUPPORT: 最高词频词频项/(FREQ) :param data: :return: ''' from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder feature_encoder = FeatureEncoder( train_data=data['WORDS'].as_matrix(), verbose=0, padding_mode='none', need_segmented=False, full_mode=True, remove_stopword=True, replace_number=True, lowercase=True, remove_url=True, sentence_padding_length=7, add_unkown_word=False, mask_zero=False, zhs2zht=True, ) # print feature_encoder.train_padding_index train_X_features = feature_encoder.to_onehot_array() np.save('result/train_X_feature', train_X_features) print train_X_features.shape print train_X_features[:5] vocabulary = feature_encoder.vocabulary print ','.join(vocabulary) print feature_encoder.vocabulary_size np.save('result/vocabulary', vocabulary) freq = np.sum(train_X_features, axis=0) favor_freq = np.sum( train_X_features[data['STANCE'].as_matrix() == u'FAVOR'], axis=0) against_freq = np.sum( train_X_features[data['STANCE'].as_matrix() == u'AGAINST'], axis=0) none_freq = np.sum( train_X_features[data['STANCE'].as_matrix() == u'NONE'], axis=0) support = np.nan_to_num([ max(favor, against, none) / (1.0 * (favor + against + none)) for favor, against, none in zip(favor_freq, against_freq, none_freq) ]) print freq print favor_freq print against_freq print none_freq count_data = pd.DataFrame( data={ u'WORD': vocabulary, u'FAVOR': favor_freq, u'AGAINST': against_freq, u'NONE': none_freq, u'SUPPORT': support, u'FREQ': freq, }) count_data = count_data.sort_values(by=[u'SUPPORT', u'FREQ', 'WORD'], ascending=False) count_data = count_data[[ u'WORD', u'FAVOR', u'AGAINST', u'NONE', u'FREQ', u'SUPPORT' ]] count_data.to_csv( 'result/word_count.csv', sep='\t', index=False, header=True, encoding='utf8', ) print count_data.head()
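# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): the typical pipeline for this
# stance-analysis DataUtil -- load, drop rows with empty fields, segment the
# text, then split into dev/test. The file path and the 'TEXT' column name
# are placeholders/assumptions; print_data_detail additionally expects
# 'TARGET' and 'STANCE' columns as described in its docstring.
# ---------------------------------------------------------------------------
data_util = DataUtil()
data = data_util.load_data('./sample_data/stance_train.csv')  # hypothetical path
data = data_util.processing_na_value(data, clear_na=True)
data['WORDS'] = data['TEXT'].apply(data_util.segment_sentence)  # 'TEXT' assumed
dev_data, test_data = data_util.split_train_test(data, train_split=0.7)
data_util.print_data_detail(dev_data, has_stance=True)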
class FeatureEncoder(object): ''' Onehot特征编码器,将句子转成onehot编码(以字典索引形式表示,补齐),包含以下函数: 1. segment_sentence:对句子分词 2. build_dictionary:构建字典 3. sentence_to_index:将原始字符串句子转为字典索引列表 4. sentence_padding:将句子补齐 5. fit_transform:构建编码器并转换数据 6. transform_sentence:对句子编码 7. get_sentence_length:对句子长度计算 8. print_sentence_length_detail: 打印训练库句子详情. 9. print_model_descibe: 打印模型的详情. 10. sentence_index_to_bow: 将索引转为onehot数据 11. to_onehot_array: 生成训练库句子的onehot编码 12. reset: clear 数据 注意: 1. 训练库中所有词,包括未知词字符(UNKOWN),的字典索引都是从1开始分配的,索引0是作为填充字符所用。 2. 训练库字典大小 (vocabulary_size)是计入索引0的,计算训练库中所有词和填充字符(PADDING)未知词字符(UNKOWN),如果不使用可以关闭。 ''' def __init__(self, need_segmented=True, verbose=0, full_mode=True, feature_type='seg', remove_stopword=True, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, sentence_padding_length=7, padding_mode='center', add_unkown_word=True, to_onehot_array=False, word2vec_to_solve_oov=False, **kwargs): """ Onehot特征编码器,将句子转成 onehot 编码(以字典索引形式表示,补齐) 1. 初始化参数 2. build feature encoder :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好']. :type need_segmented: bool :param verbose: 数值越大,输出越详细 :type verbose: int :param full_mode: jieba分词选项,是否使用 full mode,默认为True :type full_mode: bool :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。 - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机 - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机 - word_seg:分词后的字和词为单位,去重,比如 我要买手机--->我 要 买 手机 手 机 - word_seg_concat:分词后的字和词为单位,不去重,比如 我要买手机--->我 要 买 手 机 我 要 买 手机 :type feature_type: str :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True :type remove_stopword: bool :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True :type replace_number: bool :param lowercase: jieba分词选项,是否将数据统一替换成NUM,默认为True :type lowercase: bool :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True :type zhs2zht: bool :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True :type remove_url: bool :param add_unkown_word: 训练库字典的设置选项,是否在字典中增加一个未知词字符(UNKOWN) :type add_unkown_word: bool :param sentence_padding_length: 句子的补齐(截断)长度,默认为7 :type sentence_padding_length: int :param padding_mode: 句子的补齐(截断)模式,有四种模式: 1. center:如果小于sentence_padding_length的话往两边补0;如果超出sentence_padding_length的话,直接在后面截断。 2. left:如果小于sentence_padding_length的话往左边补0;如果超出sentence_padding_length的话,直接在后面截断。 3. right:如果小于sentence_padding_length的话往右边补0;如果超出sentence_padding_length的话,直接在后面截断。 4. 
none:不补齐。 :type padding_mode: str :param to_onehot_array: 输出 onehot array,还是字典索引 array,默认为False,输出字典索引, :type to_onehot_array: bool :param word2vec_to_solve_oov: 使用word2vec扩展oov词 :type word2vec_to_solve_oov: bool :param kwargs: - word2vec_model_file_path: - vocabulary_including_test_set: (default,True) - update_dictionary: (default,True) - 等 """ self.full_mode = full_mode self.feature_type = feature_type self.remove_stopword = remove_stopword self.verbose = verbose self.need_segmented = need_segmented self.replace_number = replace_number self.lowercase = lowercase self.zhs2zht = zhs2zht self.remove_url = remove_url self.add_unkown_word = add_unkown_word self.sentence_padding_length = sentence_padding_length self.padding_mode = padding_mode self.to_onehot_array = to_onehot_array self.word2vec_to_solve_oov = word2vec_to_solve_oov self.kwargs = kwargs # 检验参数合法性 assert self.padding_mode in [ 'center', 'left', 'right', 'none' ], 'padding mode 只能取: center,left,right,none' assert self.feature_type in [ 'word', 'seg', 'word_seg', 'word_seg_concat' ], 'feature type 只能取: word,seg和word_seg' # 初始化jieba分词器 if need_segmented: self.jieba_seg = Jieba_Util(verbose=self.verbose) # 训练库提取出来的字典对象 self.train_data_dict = None # 训练库提取出来的字典词汇列表 self.vocabulary = None # 训练库提取出来的字典词汇个数 self.vocabulary_size = None # UNKOWN字符的索引 self.unknow_token_index = None # PADDING字符的索引 self.padding_token_index = None # region NOTE: 这些变量不再维护,因为消耗内存 # 原始训练数据 # self.train_data = None # 切完词的句子 # self.segmented_sentences = None # 训练库句子的字典索引形式 # self.train_index = None # 训练库句子的补齐的字典索引形式 # self.train_padding_index = None # 训练库句子装成onehot array # endregion self.train_onehot_array = None # word2vec 模型 self.word2vec_model = None if word2vec_to_solve_oov: assert kwargs.has_key( 'word2vec_model_file_path'), '请提供 属性 word2vec_model_file_path' # 加载word2vec模型 w2v_util = Word2vecUtil() self.word2vec_model = w2v_util.load( kwargs.get('word2vec_model_file_path')) if verbose > 1: logging.debug('build feature encoder...') print('build feature encoder...') # self.fit_transform() def segment_sentence(self, sentence): ''' 对句子进行分词,使用jieba分词 :param sentence: 句子 :type sentence: str :return: 分完词句子,以空格连接 :rtype: str ''' if self.feature_type == 'seg': segmented_sentence = self.jieba_seg.seg( sentence, sep=' ', full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) elif self.feature_type == 'word': # 将句子切分为 以字为单元 以空格分割 # 1. 先使用jieba进行预处理,将数字替换等 segmented_sentence = self.jieba_seg.iter_each_word( sentence, sep=' ', need_segmented=True, full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) # 2. 按字切分 elif self.feature_type == 'word_seg': # 将句子切分为 以字和词为单元,相同则去重 以空格分割 # 1. 先使用jieba进行预处理,将数字替换等 segmented_sentence = self.jieba_seg.seg( sentence, sep=' ', full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) # print(segmented_sentence) # 2. 按字切分 word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split() # 3. 按词切分 seg = segmented_sentence.split() segmented_sentence = ' '.join(set(seg + word)) elif self.feature_type == 'word_seg_concat': # 先字后词拼接,不去重 # 1. 
先使用jieba进行预处理,将数字替换等 segmented_sentence = self.jieba_seg.seg( sentence, sep=' ', full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) # print(segmented_sentence) # 2. 按字切分 word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split() # 3. 按词切分 seg = segmented_sentence.split() segmented_sentence = ' '.join(word + seg) else: assert False, '不支持其他粒度的切分!' return segmented_sentence def get_sentence_length(self, sentence): ''' 计算句子的长度,注意,这里的长度以词为单位,即分完词后统计。 1. 对句子分词 2. 对句子的词计算 :param sentence: 句子 :type sentence: str :return: 句子长度 :rtype: int ''' # 1. 分词 segmented_senence = self.segment_sentence(sentence) # 2. 统计 sentence_length = len(segmented_senence.split()) return sentence_length def print_sentence_length_detail( self, data=None, lengths=[7, 10, 15, 20], ): ''' 打印训练库中句子的长度情况 :type lengths: list :param lengths: 长度界限列表 :return: 句子长度列表 :rtype: list ''' if self.need_segmented: sentence_length = map(self.get_sentence_length, data) else: sentence_length = map(lambda x: len(x.split()), data) for l in lengths: le_this_len = sum(np.asarray(sentence_length) <= l) / ( 1.0 * len(sentence_length)) print('句子长度小于等于%d的有:%f' % (l, le_this_len)) print('句子长度情况为:%s' % (str(sentence_length))) print('句子最长长度为:%d' % (max(sentence_length))) print('句子最短长度为:%d' % (min(sentence_length))) print('句子平均长度为:%d' % (np.average(sentence_length))) return sentence_length def get_unkown_vector(self, ndim=50): rand = np.random.RandomState(1337) return rand.uniform(-0.25, 0.25, ndim) def get_w2vEmbedding(self, word): """ 返回词向量 Returns ------- (array,str) """ try: if word == u'PADDING': vector = np.zeros(self.word2vec_model.vector_size) flag = 'PADDING' elif word == u'UNKOWN': # 当训练 vector = self.get_unkown_vector( self.word2vec_model.vector_size) flag = 'NO_IN_MODEL_VOCAB' else: vector = self.word2vec_model[word] flag = 'OK' except: vector = self.get_unkown_vector(self.word2vec_model.vector_size) if self.verbose > 1: print('OOV: %s' % word) flag = 'NO_IN_W2V' return np.asarray(vector), flag def to_embedding_weight(self, path): """ 使用训练好的 word2vec 模型 将字典中每个词转为 word2vec向量,接着生成一个 Embedding层的初始权重形式,可用于初始化 Embedding 层的权重。 1. 加载word2vec模型 2. 
:param path: word2vec 模型文件路径 :type path: str :return: """ if self.word2vec_model is None: w2v_util = Word2vecUtil() self.word2vec_model = w2v_util.load(path) size = self.vocabulary_size embedding_weights = np.zeros((size, self.word2vec_model.vector_size)) words_count_no_in_w2v = 0 words_count_no_in_vacab = 0 words_count_in = 0 words_count_paddding = 0 for key, value in self.train_data_dict.token2id.items(): vector, flag = self.get_w2vEmbedding(key) embedding_weights[value, :] = vector if flag == 'NO_IN_W2V': words_count_no_in_w2v += 1 if flag == 'NO_IN_MODEL_VOCAB': words_count_no_in_vacab += 1 if flag == 'OK': words_count_in += 1 # print(key) if flag == 'PADDING': words_count_paddding += 1 if self.verbose > 0: print('没有出现在w2v模型中的词有:%d个' % (words_count_no_in_w2v)) print('没有出现在模型vocab中的词有:%d个' % (words_count_no_in_vacab)) print('出现在w2v模型中的词有:%d个' % (words_count_in)) # self.embedding_weights = embedding_weights return embedding_weights def build_dictionary(self, train_X=None, test_X=None): """ 1.对数据进行分词 2.构建训练库字典,插入 一个特殊字符 'UNKOWN'表示未知词 Parameters ---------- train_X : array-like test_X : array-like Returns -------- object: self """ # region -------------- 1.将训练集和测试集合并 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('1.将训练集和测试集合并') print('1.将训练集和测试集合并') if self.kwargs.get('vocabulary_including_test_set', True): X = np.concatenate((train_X, test_X), axis=0) else: X = train_X if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 1.将训练集和测试集合并 --------------- # region -------------- 2.对数据进行分词 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('对数据进行分词') print('对数据进行分词') # -------------- code start : 开始 ------------- if self.need_segmented: segmented_sentences = map(self.segment_sentence, X) else: segmented_sentences = X # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 2.对数据进行分词 --------------- # region -------------- 3. 将句子补齐到等长 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('2. 将句子补齐到等长') print('2. 将句子补齐到等长') # -------------- code start : 开始 ------------- # 将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING padded_sentences = np.asarray( map(self.sentence_padding, segmented_sentences)) # endregion -------------- 3. 将句子补齐到等长 ------------- # region -------------- region start : 4.构建训练库字典 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('4.构建训练库字典') print('4.构建训练库字典') # -------------- code start : 开始 ------------- logging.debug('=' * 20) logging.debug('首先,构建训练库字典,然后将每个词映射到一个索引,再将所有句子映射成索引的列表') # 将训练库所有句子切分成列表,构成 2D的训练文档,每个单元是一个token, # 比如: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...] 
# 将分完词句子转成合适的数据格式 train_document = map(lambda x: x.split(), padded_sentences) # 获取训练库字典 if self.padding_mode != 'none': # 为了确保padding的索引是0,所以在最前面加入 PADDING train_document.insert(0, [u'PADDING']) self.train_data_dict = Dictionary.from_documents(train_document) # 更新字典,再字典中添加特殊符号,其中 # UNKOWN表示未知字符,即OOV词汇 if self.add_unkown_word: self.train_data_dict.add_documents([[u'UNKOWN']]) # 获取padding和UNKOWN 的字典索引 self.padding_token_index = self.train_data_dict.token2id.get( u'PADDING', -1) self.unknow_token_index = self.train_data_dict.token2id.get( u'UNKOWN', -1) self.vocabulary_size = len(self.train_data_dict.keys()) # 按索引从小到大排序 self.vocabulary = [ token for token, id in sorted(self.train_data_dict.token2id.items(), key=lambda x: x[1]) ] # print(self.vocabulary_size) # print((self.train_data_dict.token2id.items())) # quit() # -------------- print start : just print info ------------- if self.verbose > 1: logging.debug('训练库字典为:%d' % (len(self.train_data_dict.keys()))) print('训练库字典为:%d' % (len(self.train_data_dict.keys()))) logging.debug(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys()))) print(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys()))) # -------------- print end : just print info ------------- # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 4.构建训练库字典 --------------- return padded_sentences def replace_oov_with_similay_word(self, word2vec_model, sentence): ''' 对句子中oov词使用训练库中最相近的词替换(word2vec余弦相似性) :param sentence: :return: ''' # is_oov = np.asarray([item for item in self.feature_encoder.vocabulary]) # has_oov = any(is_oov) sentence = sentence.split() oov_word = [] replace_word = [] for item in sentence: if item not in self.vocabulary: oov_word.append(item) keywords_sim_score = np.asarray([ self.word_similarity(word2vec_model, item, i) for i in self.vocabulary ]) sorted_index = np.argsort(keywords_sim_score)[-1::-1] most_similarity_score = keywords_sim_score[sorted_index[0]] most_similarity_word = self.vocabulary[sorted_index[0]] if self.verbose > 1: print(u'%s 最相近的词是%s,分数为:%f' % (item, most_similarity_word, most_similarity_score)) replace_word.append(most_similarity_word) sentence += replace_word return ' '.join(sentence) def word_similarity(self, word2vec_model, word1, word2): ''' 计算两个词的相似性 :param word1: :param word2: :return: ''' try: return word2vec_model.n_similarity(word1, word2) except: return 0 def sentence_to_index(self, sentence): """ 将 sentence 转换为 index,如果 token为OOV词,则分配为 UNKOWN Parameters ---------- sentence: str 以空格分割 """ if self.add_unkown_word: unknow_token_index = self.train_data_dict.token2id[u'UNKOWN'] else: unknow_token_index = 0 # 将训练库中所有句子的每个词映射到索引上,变成索引列表 index = [ self.train_data_dict.token2id.get(item, unknow_token_index) for item in sentence.split() ] if self.verbose > 0: if index.__contains__(unknow_token_index): print('出现字典OOV') print(sentence) print(index) # assert not index.__contains__(-1),u'出现OOV词' return index def sentence_padding(self, sentence): ''' 将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING :type sentence: str :param sentence: 句子,词之间以 空格 分割 :return: 返回补齐后的句子,以空格分割 :type: str ''' padding_length = self.sentence_padding_length # print(sentence) sentence = sentence.split() sentence_length = len(sentence) # print(sentence_length) if sentence_length > padding_length: # logging.debug(u'对句子进行截断:%s' % (sentence)) sentence = sentence[:padding_length] # logging.debug(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length]))) # print(u'对句子进行截断后:%s' % (' 
'.join(seg[:padding_length])))
        elif sentence_length < padding_length:
            should_padding_length = padding_length - sentence_length
            left_padding = np.asarray(
                ['PADDING'] * (should_padding_length / 2))
            right_padding = np.asarray(
                ['PADDING'] * (should_padding_length - len(left_padding)))
            if self.padding_mode == 'center':
                sentence = np.concatenate(
                    (left_padding, sentence, right_padding), axis=0)
            elif self.padding_mode == 'left':
                sentence = np.concatenate(
                    (left_padding, right_padding, sentence), axis=0)
            elif self.padding_mode == 'right':
                sentence = np.concatenate(
                    (sentence, left_padding, right_padding), axis=0)
            elif self.padding_mode == 'none':
                sentence = sentence
            else:
                # 原代码为 raise NotImplemented,这里改为抛出异常类 NotImplementedError
                raise NotImplementedError
        sentence = ' '.join(sentence)
        return sentence

    def sentence_index_to_onehot(self, index):
        '''
            注意:该方法跟[sentence_index_to_bow]的区别。
            将词的索引转成 onehot 编码,比如(假设字典大小为5):
                索引 1 --> [1, 0, 0, 0, 0]

        :param index: 一个句子的字典索引列表
        :type index: list
        :return: onehot 编码,shape为 (句子长度,字典长度)
        :rtype: np.array()
        '''
        onehot_array = []
        for item in index:
            temp = np.zeros(self.vocabulary_size, dtype=int)
            if item == 0:
                # 索引0为填充字符,编码为全0向量
                pass
            else:
                temp[item - 1] = 1
            onehot_array.append(temp)
        # onehot_array = np.concatenate(onehot_array,axis=1)
        onehot_array = np.asarray(onehot_array)
        return onehot_array

    def sentence_index_to_bow(self, index):
        '''
            注意:该方法跟[sentence_index_to_onehot]的区别。
            将句子的字典索引转成 词包向量 编码,比如:
                [1,2] --> [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]

        :param index: 一个句子的字典索引
        :type index: list
        :return: bow 编码,长度为 字典长度
        :rtype: np.array()
        '''
        onehot_array = np.zeros(self.vocabulary_size, dtype=int)
        onehot_array[index] = 1
        return onehot_array

    def batch_sentence_index_to_onehot_array(self, sentence_indexs):
        '''
            将所有训练库句子转成onehot编码的数组,保存在 self.onehot_array 中

        :return: onehot编码的数组
        '''
        self.onehot_array = np.asarray(
            map(self.sentence_index_to_onehot, sentence_indexs))
        return self.onehot_array

    def fit_transform(self, train_data=None, test_data=None):
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self, train_X=None, test_X=None):
        """
            build feature encoder
                1. 构建训练库字典
                2. 分词,并将句子补齐到等长,补齐长度为: self.sentence_padding_length
                3. 将训练句子转成字典索引形式
                4. 将每个词的字典索引变成onehot向量

        Parameters
        ----------
        train_X : array-like
            训练句子列表:['','',...,'']
        test_X : array-like
            测试句子列表:['','',...,'']

        Returns
        -------
        object:
            self
        """
        if not self.kwargs.get('update_dictionary', True):
            # 假如不更新字典,则如果原有的字典在,就直接用原有的字典即可
            if self.vocabulary is not None:
                return self

        logging.debug('=' * 20)
        if train_X is None:
            logging.debug('没有输入训练数据!')
            assert False, '没有输入训练数据!'
        if test_X is None:
            logging.debug('构建字典需要全部数据,请输入测试数据!')
            assert False, '构建字典需要全部数据,请输入测试数据!'

        # region -------------- 1.构建训练库字典 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1.构建训练库字典')
            print('1.构建训练库字典')
        # -------------- code start : 开始 -------------
        # 构建训练库字典
        self.build_dictionary(train_X, test_X)
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- 1.构建训练库字典 ---------------

        return self

    def transform_sentence(self, sentence):
        """
            转换一个句子的格式。跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引
                1. 分词
                2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表
                3. 每个词的字典索引变成onehot向量

        :param sentence: 输入句子,不用分词,进来后会有分词处理
        :type sentence: str
        :return: 补齐的字典索引
        :rtype: array-like
        """
        assert self.train_data_dict is not None, '请先fit_transform()模型'

        # region -------------- 1. 分词 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 分词')
            print('1. 分词')
        # -------------- code start : 开始 -------------
        # 分词
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence
        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(
                self.word2vec_model, seg_sentence)
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 1. 分词 ---------------

        # region -------------- 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
            print('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
        # -------------- code start : 开始 -------------
        paded_sentence = self.sentence_padding(seg_sentence)
        sentence_index = self.sentence_to_index(paded_sentence)
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 ---------------

        # region -------------- 3. 将每个词的字典索引变成onehot向量 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('3. 将每个词的字典索引变成onehot向量')
            print('3. 将每个词的字典索引变成onehot向量')
        # -------------- code start : 开始 -------------
        if self.to_onehot_array:
            onehot_array = self.sentence_index_to_onehot(sentence_index)
        else:
            onehot_array = sentence_index
        # -------------- code start : 结束 -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion -------------- region end : 3. 将每个词的字典索引变成onehot向量 ---------------

        return onehot_array

    def transform(self, X):
        '''
            批量转换数据,跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引
                1. 直接调用 self.transform_sentence 进行处理

        :param X: 输入句子列表
        :type X: array-like
        :return: 补齐的字典索引
        :rtype: array-like
        '''
        index = map(lambda x: self.transform_sentence(x), X)
        # print train_index[:5]
        return np.asarray(index)

    def reset(self):
        """
            清理对象中的数据
                - self.vocabulary
        """
        self.vocabulary = None

    def print_model_descibe(self):
        '''
            打印模型参数详情

        :return: 参数设置详情
        :rtype: dict 或 {}
        '''
        import pprint
        # 注意:原实现引用了不再维护的 self.train_data(见 __init__ 中的 NOTE),
        # 且将 padding_mode、add_unkown_word、mask_zero 写成了固定值;
        # 这里改为输出实际的对象属性。
        detail = {
            'need_segmented': self.need_segmented,
            'feature_type': self.feature_type,
            'verbose': self.verbose,
            'full_mode': self.full_mode,
            'remove_stopword': self.remove_stopword,
            'replace_number': self.replace_number,
            'sentence_padding_length': self.sentence_padding_length,
            'padding_mode': self.padding_mode,
            'vocabulary_size': self.vocabulary_size,
            'padding_token_index': self.padding_token_index,
            'unknow_token_index': self.unknow_token_index,
            'add_unkown_word': self.add_unkown_word,
        }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
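# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): encode two toy corpora into padded
# dictionary-index arrays. With need_segmented=False the inputs must already
# be space-separated tokens; the sentences below are made-up examples, and
# the module's own imports (np, logging, gensim's Dictionary, Jieba_Util)
# are assumed to be available as elsewhere in this file.
# ---------------------------------------------------------------------------
encoder = FeatureEncoder(
    need_segmented=False,
    feature_type='seg',
    sentence_padding_length=5,
    padding_mode='center',
    add_unkown_word=True,
    to_onehot_array=False,
)
train_X = [u'我 要 买 手机', u'你好 早上 好']
test_X = [u'我 想 买 电脑']
train_index = encoder.fit_transform(train_X, test_X)
test_index = encoder.transform(test_X)
print(train_index)
print(test_index)
encoder.print_model_descibe()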
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()
train_y = train_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
train_data = train_data[['LABEL', 'SENTENCE']]
test_data = test_data[['LABEL', 'SENTENCE']]
logging.debug('=' * 20)
logging.debug('对数据进行分词...')
logging.debug('-' * 20)
jutil = Jieba_Util()
if config['feature_type'] == 'word':
    sentence_to_seg = lambda x: jutil.iter_each_word(
        sentence=x,
        need_segmented=True,
        sep=' ',
        full_mode=config['full_mode'],
        remove_stopword=config['remove_stopword'],
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
else:
    # NOTE: the original fragment was truncated after "jutil.seg(sentence=x,";
    # the completion below assumes the seg branch mirrors the word branch's
    # options, matching the other jutil.seg calls in this project.
    sentence_to_seg = lambda x: jutil.seg(
        sentence=x,
        sep=' ',
        full_mode=config['full_mode'],
        remove_stopword=config['remove_stopword'],
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
    )
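# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative): apply the segmenter chosen above to
# both splits, producing the WORDS column used elsewhere in this project.
# ---------------------------------------------------------------------------
train_data['WORDS'] = train_data['SENTENCE'].apply(sentence_to_seg)
test_data['WORDS'] = test_data['SENTENCE'].apply(sentence_to_seg)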