def test2():
    """Train and save a 50-dimensional CBOW word2vec model on the Sa sample set.

    Steps, in order:

    1. Read the tab-separated training file and drop the rows whose ``LABEL``
       is ``其它#其它`` or ``其它#捣乱``.
    2. Turn every ``SENTENCE`` into a space-separated string with
       ``Jieba_Util.iter_each_word``.
    3. Train a ``Word2vecUtil`` model on those strings, print its description
       and the neighbours of one probe character.
    4. Save the model under ``vector/``.
    """
    corpus_path = './sample_data/v2.3_train_Sa_891.csv'
    data = pd.read_csv(corpus_path,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)
    for excluded_label in (u'其它#其它', u'其它#捣乱'):
        data = data[data['LABEL'] != excluded_label]
    print(data.head())

    # Segmentation.
    jieba_util = Jieba_Util()

    def segment_sentence(x):
        # One sentence in, one space-separated string out.
        return jieba_util.iter_each_word(
            sentence=x,
            sep=' ',
            need_segmented=True,
            full_mode=False,
            remove_stopword=False,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
        )

    segmented = data['SENTENCE'].apply(segment_sentence)
    data['WORDS'] = segmented.as_matrix()
    sentences = data['WORDS'].as_matrix()
    # ``sentences.shape`` is a one-element tuple, so it fills the single %d.
    print('句子数:%d' % sentences.shape)

    util = Word2vecUtil(size=50, train_method='cbow')
    util.train(sentences)
    util.print_model_descibe()

    # NOTE(review): the first query's result is overwritten before it is
    # used; only the neighbours of u'喜' are printed.
    most_similar_words = util.model.most_similar(u'机')
    most_similar_words = util.model.most_similar(u'喜')
    neighbour_words = [word for word, _ in most_similar_words]
    print(','.join(neighbour_words))

    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
def test2():
    """Train and save a 50-dimensional CBOW word2vec model on the Sa sample set.

    Steps, in order:

    1. Read the tab-separated training file and drop the rows whose ``LABEL``
       is ``其它#其它`` or ``其它#捣乱``.
    2. Turn every ``SENTENCE`` into a space-separated string with
       ``Jieba_Util.iter_each_word``.
    3. Train a ``Word2vecUtil`` model on those strings, print its description
       and the neighbours of one probe character.
    4. Save the model under ``vector/``.
    """
    corpus_path = './sample_data/v2.3_train_Sa_891.csv'
    data = pd.read_csv(corpus_path,
                       encoding='utf8',
                       sep='\t',
                       index_col=0,
                       header=0)
    for excluded_label in (u'其它#其它', u'其它#捣乱'):
        data = data[data['LABEL'] != excluded_label]
    print(data.head())

    # Segmentation.
    jieba_util = Jieba_Util()

    def segment_sentence(x):
        # One sentence in, one space-separated string out.
        return jieba_util.iter_each_word(
            sentence=x,
            sep=' ',
            need_segmented=True,
            full_mode=False,
            remove_stopword=False,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
        )

    segmented = data['SENTENCE'].apply(segment_sentence)
    data['WORDS'] = segmented.as_matrix()
    sentences = data['WORDS'].as_matrix()
    # ``sentences.shape`` is a one-element tuple, so it fills the single %d.
    print('句子数:%d' % sentences.shape)

    util = Word2vecUtil(size=50, train_method='cbow')
    util.train(sentences)
    util.print_model_descibe()

    # NOTE(review): the first query's result is overwritten before it is
    # used; only the neighbours of u'喜' are printed.
    most_similar_words = util.model.most_similar(u'机')
    most_similar_words = util.model.most_similar(u'喜')
    neighbour_words = [word for word, _ in most_similar_words]
    print(','.join(neighbour_words))

    util.save('vector/v2.3_train_Sa_891_word_50dim.gem')
class FeatureEncoder(object):
    """Encode sentences as padded dictionary-index lists or one-hot arrays.

    Main entry points:

    1. ``segment_sentence``: segment one raw sentence.
    2. ``build_dictionary``: segment, pad and build the token dictionary.
    3. ``sentence_to_index``: map a space-separated sentence to indices.
    4. ``sentence_padding``: pad / truncate a sentence to a fixed length.
    5. ``fit`` / ``fit_transform``: build the encoder (and encode the data).
    6. ``transform_sentence`` / ``transform``: encode one / many sentences.
    7. ``get_sentence_length``: number of tokens after segmentation.
    8. ``print_sentence_length_detail``: print sentence-length statistics.
    9. ``print_model_descibe``: print the encoder settings.
    10. ``sentence_index_to_bow``: indices -> bag-of-words vector.
    11. ``sentence_index_to_onehot``: indices -> one one-hot row per token.
    12. ``reset``: drop the cached vocabulary.

    Notes:
        * ``PADDING`` is inserted as the first training document so that it
          is the first token the dictionary sees (intended to give it index
          0 -- the actual id is assigned by ``Dictionary``; it is stored in
          ``padding_token_index``).
        * ``vocabulary_size`` counts every dictionary entry, including
          ``PADDING`` and ``UNKOWN`` when they are enabled.
    """

    def __init__(self,
                 need_segmented=True,
                 verbose=0,
                 full_mode=True,
                 feature_type='seg',
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 sentence_padding_length=7,
                 padding_mode='center',
                 add_unkown_word=True,
                 to_onehot_array=False,
                 word2vec_to_solve_oov=False,
                 **kwargs):
        """Store the settings and create the helpers they require.

        :param need_segmented: if True the input sentences are raw text and
            are segmented here; if False every sentence must already be
            space-separated tokens, e.g. ``['我 要 买 手机', '你好']``.
        :type need_segmented: bool
        :param verbose: the larger the value, the more is printed / logged.
        :type verbose: int
        :param full_mode: forwarded to the jieba helper (full mode switch).
        :type full_mode: bool
        :param feature_type: granularity of the features, one of
            ``word`` (single characters), ``seg`` (segmented words),
            ``word_seg`` (characters and words, de-duplicated) or
            ``word_seg_concat`` (characters followed by words, duplicates
            kept).
        :type feature_type: str
        :param remove_stopword: forwarded to the jieba helper.
        :type remove_stopword: bool
        :param replace_number: forwarded to the jieba helper.
        :type replace_number: bool
        :param lowercase: forwarded to the jieba helper (presumably
            lower-cases the text -- confirm against ``Jieba_Util``).
        :type lowercase: bool
        :param zhs2zht: forwarded to the jieba helper (traditional /
            simplified conversion switch -- confirm against ``Jieba_Util``).
        :type zhs2zht: bool
        :param remove_url: forwarded to the jieba helper.
        :type remove_url: bool
        :param sentence_padding_length: length every sentence is padded or
            truncated to.
        :type sentence_padding_length: int
        :param padding_mode: ``center`` (pad both sides), ``left`` (pad on
            the left), ``right`` (pad on the right) or ``none`` (no padding).
            Sentences longer than ``sentence_padding_length`` are always
            truncated at the end.
        :type padding_mode: str
        :param add_unkown_word: add the ``UNKOWN`` token to the dictionary.
        :type add_unkown_word: bool
        :param to_onehot_array: if True ``transform_sentence`` returns a
            one-hot array, otherwise the list of dictionary indices.
        :type to_onehot_array: bool
        :param word2vec_to_solve_oov: extend sentences containing
            out-of-vocabulary tokens with their most similar vocabulary
            token; requires ``word2vec_model_file_path`` in ``kwargs``.
        :type word2vec_to_solve_oov: bool
        :param kwargs: extra options read by this class:
            ``word2vec_model_file_path``,
            ``vocabulary_including_test_set`` (default True) and
            ``update_dictionary`` (default True).
        :raises AssertionError: on an unsupported ``padding_mode`` or
            ``feature_type``, or when the word2vec model path is missing.
        """
        self.full_mode = full_mode
        self.feature_type = feature_type
        self.remove_stopword = remove_stopword
        self.verbose = verbose
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.add_unkown_word = add_unkown_word
        self.sentence_padding_length = sentence_padding_length
        self.padding_mode = padding_mode
        self.to_onehot_array = to_onehot_array
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # Validate the settings.
        assert self.padding_mode in [
            'center', 'left', 'right', 'none'
        ], 'padding mode 只能取: center,left,right,none'
        assert self.feature_type in [
            'word', 'seg', 'word_seg', 'word_seg_concat'
        ], 'feature type 只能取: word,seg,word_seg和word_seg_concat'

        # The jieba helper is only needed when this class segments the text.
        self.jieba_seg = Jieba_Util(verbose=self.verbose) if need_segmented else None

        # Dictionary object built from the training data.
        self.train_data_dict = None
        # Tokens of the dictionary, ordered by index.
        self.vocabulary = None
        # Number of entries in the dictionary.
        self.vocabulary_size = None
        # Index of the UNKOWN token (-1 when absent).
        self.unknow_token_index = None
        # Index of the PADDING token (-1 when absent).
        self.padding_token_index = None
        # NOTE: the raw training data, the segmented sentences and their
        # index forms are deliberately not kept on the instance (memory).
        self.train_onehot_array = None

        # word2vec model, loaded eagerly only when it is used for OOV tokens.
        self.word2vec_model = None
        if word2vec_to_solve_oov:
            assert 'word2vec_model_file_path' in kwargs, \
                '请提供 属性 word2vec_model_file_path'
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        self._trace('build feature encoder...')

    def _trace(self, *messages):
        """Log (debug) and print each message, but only when ``verbose > 1``."""
        if self.verbose > 1:
            for message in messages:
                logging.debug(message)
                print(message)

    def segment_sentence(self, sentence):
        """Segment one sentence according to ``feature_type``.

        :param sentence: raw sentence.
        :type sentence: str
        :return: the tokens joined by single spaces.
        :rtype: str
        :raises AssertionError: if ``feature_type`` is not supported.
        """
        # Options shared by every call into the jieba helper.
        options = dict(
            sep=' ',
            full_mode=self.full_mode,
            remove_stopword=self.remove_stopword,
            replace_number=self.replace_number,
            lowercase=self.lowercase,
            zhs2zht=self.zhs2zht,
            remove_url=self.remove_url,
            HMM=False,
        )

        if self.feature_type == 'word':
            # Character-level units; the helper pre-processes the text first.
            return self.jieba_seg.iter_each_word(
                sentence, need_segmented=True, **options)

        if self.feature_type not in ('seg', 'word_seg', 'word_seg_concat'):
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError('不支持其他粒度的切分!')

        segmented_sentence = self.jieba_seg.seg(sentence, **options)
        if self.feature_type == 'seg':
            return segmented_sentence

        # Both remaining modes need the characters as well as the words.
        word = self.jieba_seg.iter_each_word(
            segmented_sentence, sep=' ', need_segmented=False).split()
        seg = segmented_sentence.split()
        if self.feature_type == 'word_seg':
            # NOTE(review): ``set`` removes duplicates but does not keep the
            # token order, so which tokens survive a later truncation is
            # arbitrary. Left unchanged to stay compatible with existing
            # encodings.
            return ' '.join(set(seg + word))
        # word_seg_concat: characters first, then words, duplicates kept.
        return ' '.join(word + seg)

    def get_sentence_length(self, sentence):
        """Return the number of tokens of ``sentence`` after segmentation.

        :param sentence: raw sentence.
        :type sentence: str
        :return: token count.
        :rtype: int
        """
        return len(self.segment_sentence(sentence).split())

    def print_sentence_length_detail(
            self,
            data=None,
            lengths=(7, 10, 15, 20),
    ):
        """Print length statistics for the sentences in ``data``.

        :param data: sentences; raw text when ``need_segmented`` is True,
            space-separated tokens otherwise. ``None`` is treated as empty.
        :type data: array-like
        :param lengths: thresholds; for each one the fraction of sentences
            whose length is less than or equal to it is printed.
        :type lengths: list
        :return: the length of every sentence (empty list for empty input,
            in which case nothing is printed).
        :rtype: list
        """
        if data is None:
            data = []
        if self.need_segmented:
            sentence_length = [self.get_sentence_length(item) for item in data]
        else:
            sentence_length = [len(item.split()) for item in data]

        if not sentence_length:
            # No sentences: ratios, max and min are undefined.
            return sentence_length

        length_array = np.asarray(sentence_length)
        total = 1.0 * len(sentence_length)
        for threshold in lengths:
            le_this_len = sum(length_array <= threshold) / total
            print('句子长度小于等于%d的有:%f' % (threshold, le_this_len))

        print('句子长度情况为:%s' % (str(sentence_length)))
        print('句子最长长度为:%d' % (max(sentence_length)))
        print('句子最短长度为:%d' % (min(sentence_length)))
        print('句子平均长度为:%d' % (np.average(sentence_length)))
        return sentence_length

    def get_unkown_vector(self, ndim=50):
        """Return a uniform random vector in [-0.25, 0.25) of size ``ndim``.

        A fresh ``RandomState`` with the fixed seed 1337 is created on every
        call, so the result is the same for every call with the same ``ndim``.
        """
        rand = np.random.RandomState(1337)
        return rand.uniform(-0.25, 0.25, ndim)

    def get_w2vEmbedding(self, word):
        """Return the embedding of ``word`` together with a status flag.

        :param word: dictionary token.
        :return: ``(vector, flag)`` where ``flag`` is ``'PADDING'`` (zero
            vector), ``'NO_IN_MODEL_VOCAB'`` (the ``UNKOWN`` token),
            ``'OK'`` (found in the word2vec model) or ``'NO_IN_W2V'``
            (lookup raised ``KeyError``; the unknown vector is returned).
        :rtype: (numpy.ndarray, str)
        """
        vector_size = self.word2vec_model.vector_size
        if word == u'PADDING':
            return np.asarray(np.zeros(vector_size)), 'PADDING'
        if word == u'UNKOWN':
            return np.asarray(self.get_unkown_vector(vector_size)), 'NO_IN_MODEL_VOCAB'

        try:
            vector = self.word2vec_model[word]
        except KeyError:
            # The token is missing from the word2vec model.
            if self.verbose > 1:
                print('OOV: %s' % word)
            return np.asarray(self.get_unkown_vector(vector_size)), 'NO_IN_W2V'
        return np.asarray(vector), 'OK'

    def to_embedding_weight(self, path):
        """Build an initial Embedding-layer weight matrix from word2vec.

        Row ``i`` holds the vector returned by ``get_w2vEmbedding`` for the
        dictionary token with index ``i``.

        :param path: word2vec model file; only loaded when no model has been
            loaded yet.
        :type path: str
        :return: array of shape ``(vocabulary_size, vector_size)``.
        :rtype: numpy.ndarray
        """
        if self.word2vec_model is None:
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(path)

        embedding_weights = np.zeros(
            (self.vocabulary_size, self.word2vec_model.vector_size))
        flag_counts = {
            'NO_IN_W2V': 0,
            'NO_IN_MODEL_VOCAB': 0,
            'OK': 0,
            'PADDING': 0,
        }
        for token, token_index in self.train_data_dict.token2id.items():
            vector, flag = self.get_w2vEmbedding(token)
            embedding_weights[token_index, :] = vector
            flag_counts[flag] += 1

        if self.verbose > 0:
            print('没有出现在w2v模型中的词有:%d个' % (flag_counts['NO_IN_W2V']))
            print('没有出现在模型vocab中的词有:%d个' % (flag_counts['NO_IN_MODEL_VOCAB']))
            print('出现在w2v模型中的词有:%d个' % (flag_counts['OK']))

        return embedding_weights

    def build_dictionary(self, train_X=None, test_X=None):
        """Segment and pad the data, then build the token dictionary.

        Sets ``train_data_dict``, ``padding_token_index``,
        ``unknow_token_index``, ``vocabulary_size`` and ``vocabulary``.

        Parameters
        ----------
        train_X : array-like
            Training sentences.
        test_X : array-like
            Test sentences; concatenated with ``train_X`` unless the
            ``vocabulary_including_test_set`` option is False.

        Returns
        -------
        numpy.ndarray
            The padded, space-separated sentences the dictionary was built
            from.
        """
        # 1. Merge the training and test sets.
        self._trace('-' * 20, '1.将训练集和测试集合并')
        if self.kwargs.get('vocabulary_including_test_set', True):
            X = np.concatenate((train_X, test_X), axis=0)
        else:
            X = train_X
        self._trace('-' * 20)

        # 2. Segment the data.
        self._trace('-' * 20, '对数据进行分词')
        if self.need_segmented:
            segmented_sentences = [self.segment_sentence(item) for item in X]
        else:
            segmented_sentences = X
        self._trace('-' * 20)

        # 3. Pad / truncate every sentence to the same length.
        self._trace('-' * 20, '2. 将句子补齐到等长')
        padded_sentences = np.asarray(
            [self.sentence_padding(item) for item in segmented_sentences])

        # 4. Build the dictionary.
        self._trace('-' * 20, '4.构建训练库字典')
        logging.debug('=' * 20)
        logging.debug('首先,构建训练库字典,然后将每个词映射到一个索引,再将所有句子映射成索引的列表')

        # 2-D document: one list of tokens per sentence, e.g.
        # [['今年', '你', '多少岁'], ['你', '二十四', '小时', '在线', '吗'], ...]
        train_document = [item.split() for item in padded_sentences]
        if self.padding_mode != 'none':
            # PADDING goes first so that it is the first token added.
            train_document.insert(0, [u'PADDING'])

        self.train_data_dict = Dictionary.from_documents(train_document)
        if self.add_unkown_word:
            # UNKOWN stands for every out-of-vocabulary token.
            self.train_data_dict.add_documents([[u'UNKOWN']])

        token2id = self.train_data_dict.token2id
        self.padding_token_index = token2id.get(u'PADDING', -1)
        self.unknow_token_index = token2id.get(u'UNKOWN', -1)
        self.vocabulary_size = len(self.train_data_dict.keys())
        # Tokens ordered by ascending index.
        self.vocabulary = [
            token
            for token, _ in sorted(token2id.items(), key=lambda pair: pair[1])
        ]

        if self.verbose > 1:
            self._trace(
                '训练库字典为:%d' % (len(self.train_data_dict.keys())),
                u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys())),
            )
        self._trace('-' * 20)

        return padded_sentences

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        """Append, for every OOV token, its most similar vocabulary token.

        The OOV tokens themselves are kept; the best-scoring vocabulary
        token (score from ``word_similarity``) is appended to the end of the
        sentence for each of them.

        :param word2vec_model: model handed to ``word_similarity``.
        :param sentence: space-separated sentence.
        :type sentence: str
        :return: the extended sentence, space-separated.
        :rtype: str
        """
        sentence = sentence.split()
        # Set for O(1) membership tests; ``self.vocabulary`` stays the
        # ordered list used for indexing below.
        known_tokens = set(self.vocabulary)
        replace_word = []
        for item in sentence:
            if item in known_tokens:
                continue
            keywords_sim_score = np.asarray([
                self.word_similarity(word2vec_model, item, candidate)
                for candidate in self.vocabulary
            ])
            # Highest score first.
            sorted_index = np.argsort(keywords_sim_score)[-1::-1]
            most_similarity_score = keywords_sim_score[sorted_index[0]]
            most_similarity_word = self.vocabulary[sorted_index[0]]
            if self.verbose > 1:
                print(u'%s 最相近的词是%s,分数为:%f' %
                      (item, most_similarity_word, most_similarity_score))
            replace_word.append(most_similarity_word)

        sentence += replace_word
        return ' '.join(sentence)

    def word_similarity(self, word2vec_model, word1, word2):
        """Return ``word2vec_model.n_similarity(word1, word2)``, or 0 on error.

        Best effort by design: any ``Exception`` raised by the model (for
        instance a token it does not know) yields a similarity of 0.
        """
        try:
            return word2vec_model.n_similarity(word1, word2)
        except Exception:
            return 0

    def sentence_to_index(self, sentence):
        """Map a space-separated sentence to its list of dictionary indices.

        Tokens missing from the dictionary get the ``UNKOWN`` index, or 0
        when ``add_unkown_word`` is False.

        Parameters
        ----------
        sentence : str
            Space-separated tokens.

        Returns
        -------
        list
            One index per token.
        """
        token2id = self.train_data_dict.token2id
        if self.add_unkown_word:
            unknow_token_index = token2id[u'UNKOWN']
        else:
            unknow_token_index = 0

        tokens = sentence.split()
        index = [token2id.get(item, unknow_token_index) for item in tokens]

        if self.verbose > 0:
            # Check the tokens themselves: the fallback index may coincide
            # with a real entry (0 when UNKOWN is disabled).
            if any(item not in token2id for item in tokens):
                print('出现字典OOV')
                print(sentence)
                print(index)
        return index

    def sentence_padding(self, sentence):
        """Pad or truncate a sentence to ``sentence_padding_length`` tokens.

        Longer sentences are cut at the end. Shorter ones are filled with
        ``PADDING`` tokens according to ``padding_mode``; in ``center`` mode
        the extra token of an odd gap goes to the right side. ``none``
        leaves short sentences unchanged.

        :param sentence: space-separated tokens.
        :type sentence: str
        :return: the padded sentence, space-separated.
        :rtype: str
        :raises NotImplementedError: for an unknown ``padding_mode``.
        """
        padding_length = self.sentence_padding_length
        tokens = sentence.split()
        should_padding_length = padding_length - len(tokens)

        if should_padding_length < 0:
            tokens = tokens[:padding_length]
        elif should_padding_length > 0:
            # Floor division keeps the count an int under true division.
            left_padding = ['PADDING'] * (should_padding_length // 2)
            right_padding = ['PADDING'] * (
                should_padding_length - len(left_padding))
            if self.padding_mode == 'center':
                tokens = left_padding + tokens + right_padding
            elif self.padding_mode == 'left':
                tokens = left_padding + right_padding + tokens
            elif self.padding_mode == 'right':
                tokens = tokens + left_padding + right_padding
            elif self.padding_mode == 'none':
                pass
            else:
                raise NotImplementedError(
                    'unsupported padding mode: %s' % self.padding_mode)

        return ' '.join(tokens)

    def sentence_index_to_onehot(self, index):
        """Turn a list of indices into one one-hot row per index.

        Index 0 yields an all-zero row; any other index ``i`` sets column
        ``i - 1``. Compare with ``sentence_index_to_bow``, which produces a
        single vector for the whole sentence.

        :param index: dictionary indices of one sentence.
        :type index: list
        :return: array of shape ``(len(index), vocabulary_size)``.
        :rtype: numpy.ndarray
        """
        onehot_array = np.zeros((len(index), self.vocabulary_size), dtype=int)
        for row, item in enumerate(index):
            if item != 0:
                onehot_array[row, item - 1] = 1
        return onehot_array

    def sentence_index_to_bow(self, index):
        """Turn a list of indices into one bag-of-words vector.

        Example with a dictionary of size 10:
        ``[1, 2] -> [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]``.

        :param index: dictionary indices of one sentence.
        :type index: list
        :return: vector of length ``vocabulary_size``.
        :rtype: numpy.ndarray
        """
        onehot_array = np.zeros(self.vocabulary_size, dtype=int)
        onehot_array[index] = 1
        return onehot_array

    def batch_sentence_index_to_onehot_array(self, sentence_indexs):
        """One-hot encode many sentences and cache the result.

        :param sentence_indexs: one index list per sentence.
        :return: the array, also stored in ``self.onehot_array``.
        """
        self.onehot_array = np.asarray(
            [self.sentence_index_to_onehot(item) for item in sentence_indexs])
        return self.onehot_array

    def fit_transform(self, train_data=None, test_data=None):
        """Run ``fit(train_data, test_data)`` and then ``transform(train_data)``."""
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self, train_X=None, test_X=None):
        """Build the dictionary from the given data.

        When the ``update_dictionary`` option is False and a vocabulary
        already exists, nothing is rebuilt.

        Parameters
        ----------
        train_X : array-like
            Training sentences: ['', '', ..., ''].
        test_X : array-like
            Test sentences: ['', '', ..., '']. Required only when the
            vocabulary includes the test set (the default).

        Returns
        -------
        FeatureEncoder
            ``self``.

        Raises
        ------
        AssertionError
            If ``train_X`` is missing, or ``test_X`` is missing while it is
            needed for the vocabulary.
        """
        if not self.kwargs.get('update_dictionary', True):
            # Keep the existing dictionary if there is one.
            if self.vocabulary is not None:
                return self

        logging.debug('=' * 20)
        if train_X is None:
            logging.debug('没有输入训练数据!')
            raise AssertionError('没有输入训练数据!')

        needs_test_set = self.kwargs.get('vocabulary_including_test_set', True)
        if test_X is None and needs_test_set:
            logging.debug('构建字典需要全部数据,请输入测试数据!')
            raise AssertionError('构建字典需要全部数据,请输入测试数据!')

        self._trace('-' * 20, '1.构建训练库字典')
        self.build_dictionary(train_X, test_X)
        self._trace('-' * 20)

        return self

    def transform_sentence(self, sentence):
        """Encode one sentence the same way the training data was encoded.

        1. Segment (when ``need_segmented``) and optionally extend OOV
           tokens through word2vec.
        2. Pad, then map to dictionary indices.
        3. Convert to a one-hot array when ``to_onehot_array`` is set.

        :param sentence: input sentence.
        :type sentence: str
        :return: padded index list, or its one-hot array.
        :rtype: array-like
        :raises AssertionError: if the dictionary has not been built yet.
        """
        assert self.train_data_dict is not None, '请先fit_transform()模型'

        # 1. Segmentation.
        self._trace('-' * 20, '1. 分词')
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence
        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(
                self.word2vec_model, seg_sentence)
        self._trace('-' * 20)

        # 2. Pad, then convert to dictionary indices.
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            # The logged and printed texts differ in the original; both are
            # kept verbatim.
            logging.debug('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表')
            print('2. 转为字典索引列表,之后补齐,输入为补齐的字典索引列表')
        paded_sentence = self.sentence_padding(seg_sentence)
        sentence_index = self.sentence_to_index(paded_sentence)
        self._trace('-' * 20)

        # 3. Optional one-hot conversion.
        self._trace('-' * 20, '3. 将每个词的字典索引变成onehot向量')
        if self.to_onehot_array:
            onehot_array = self.sentence_index_to_onehot(sentence_index)
        else:
            onehot_array = sentence_index
        self._trace('-' * 20)

        return onehot_array

    def transform(self, X):
        """Encode every sentence of ``X`` with ``transform_sentence``.

        :param X: input sentences.
        :type X: array-like
        :return: the encoded sentences.
        :rtype: numpy.ndarray
        """
        return np.asarray([self.transform_sentence(item) for item in X])

    def reset(self):
        """Clear the cached ``vocabulary`` so that ``fit`` rebuilds it."""
        self.vocabulary = None

    def print_model_descibe(self):
        """Pretty-print, log and return the encoder settings.

        ``train_data_count`` is ``None`` unless a ``train_data`` attribute
        has been attached to the instance (the encoder itself no longer
        keeps the training data).

        :return: the settings.
        :rtype: dict
        """
        import pprint

        train_data = getattr(self, 'train_data', None)
        detail = {
            'train_data_count': None if train_data is None else len(train_data),
            'need_segmented': self.need_segmented,
            'feature_type': self.feature_type,
            'verbose': self.verbose,
            'full_mode': self.full_mode,
            'remove_stopword': self.remove_stopword,
            'replace_number': self.replace_number,
            'sentence_padding_length': self.sentence_padding_length,
            'padding_mode': self.padding_mode,
            'vocabulary_size': self.vocabulary_size,
            'padding_token_index': self.padding_token_index,
            'unknow_token_index': self.unknow_token_index,
            'add_unkown_word': self.add_unkown_word,
            'mask_zero': True,
        }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
class FeatureEncoder(object): """ Onehot特征编码器,将句子转成 onehot编码 函数列表为: 1. segment_sentence:对句子分词 2. build_dictionary:构建字典 3. sentence_to_index:将原始字符串句子转为字典索引列表 4. sentence_padding:将句子补齐 5. fit_transform:构建编码器并转换数据 6. transform_sentence:对句子编码 7. get_sentence_length:对句子长度计算 8. print_sentence_length_detail: 打印训练库句子详情. 9. print_model_descibe: 打印模型的详情. 10. sentence_index_to_bow: 将索引转为onehot数据 11. to_onehot_array: 生成训练库句子的onehot编码 12. reset: clear 数据 注意: 1. onehot编码 有两种形式:通过设置 to_onehot_array 切换 - 字典索引形式表示,补齐 (默认是这种形式) - onehot 向量 0. 训练库中所有词,包括未知词字符(UNKOWN),的字典索引都是从1开始分配的,索引0是作为填充字符所用。 1. 训练库字典大小 (vocabulary_size)是计入索引0的,计算训练库中所有词和填充字符(PADDING)未知词字符(UNKOWN),如果不使用可以关闭。 """ __version__ = '1.4' def __init__(self, need_segmented=True, verbose=0, full_mode=True, feature_type='seg', remove_stopword=True, replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, sentence_padding_length=7, padding_mode='center', add_unkown_word=True, to_onehot_array=False, word2vec_to_solve_oov=False, **kwargs ): """ Onehot特征编码器,将句子转成 onehot 编码(以字典索引形式表示,补齐) 1. 初始化参数 2. build feature encoder :param need_segmented: 数据处理选项,是否需要经过分词处理;如果为False,那么输入的数据不需要分词,提供的数据的每个句子的每个词要以空格分割.比如: ['我 要 买 手机','你好','早上 好'];如果为True,提供原始输入句子即可,比如:['我要买手机','你好','早上好']. 
:type need_segmented: bool :param verbose: 数值越大,输出越详细 :type verbose: int :param full_mode: jieba分词选项,是否使用 full mode,默认为True :type full_mode: bool :param feature_type: 模型设置选项,选择不同粒度的特征单位, 目前只支持 word,seg和 word_seg。 - word:直接以字为单位,比如 我要买手机--->我 要 买 手 机 - seg:分词后的词单元为单位,比如 我要买手机--->我 要 买 手机 - word_seg:分词后的字和词为单位,去重,比如 我要买手机--->我 要 买 手机 手 机 - word_seg_concat:分词后的字和词为单位,不去重,比如 我要买手机--->我 要 买 手 机 我 要 买 手机 :type feature_type: str :param remove_stopword: jieba分词选项,是否去除 stop word,默认为True :type remove_stopword: bool :param replace_number: jieba分词选项,是否将数据统一替换成NUM,默认为True :type replace_number: bool :param lowercase: jieba分词选项,是否将数据统一替换成NUM,默认为True :type lowercase: bool :param zhs2zht: jieba分词选项,出現繁体的時候,是否转简体,默认为True :type zhs2zht: bool :param remove_url: jieba分词选项,是否移除 微博url,http://t.cn/开头的地址,默认为True :type remove_url: bool :param add_unkown_word: 训练库字典的设置选项,是否在字典中增加一个未知词字符(UNKOWN) :type add_unkown_word: bool :param sentence_padding_length: 句子的补齐(截断)长度,默认为7 :type sentence_padding_length: int :param padding_mode: 句子的补齐(截断)模式,有四种模式: 1. center:如果小于sentence_padding_length的话往两边补0;如果超出sentence_padding_length的话,直接在后面截断。 2. left:如果小于sentence_padding_length的话往左边补0;如果超出sentence_padding_length的话,直接在后面截断。 3. right:如果小于sentence_padding_length的话往右边补0;如果超出sentence_padding_length的话,直接在后面截断。 4. 
none:不补齐。 :type padding_mode: str :param to_onehot_array: 输出 onehot array,还是字典索引 array,默认为False,输出字典索引, :type to_onehot_array: bool :param word2vec_to_solve_oov: 使用word2vec扩展oov词 :type word2vec_to_solve_oov: bool :param kwargs: - word2vec_model_file_path: - vocabulary_including_test_set: (default,True) - update_dictionary: (default,True) - 等 """ self.full_mode = full_mode self.feature_type = feature_type self.remove_stopword = remove_stopword self.verbose = verbose self.need_segmented = need_segmented self.replace_number = replace_number self.lowercase = lowercase self.zhs2zht = zhs2zht self.remove_url = remove_url self.add_unkown_word = add_unkown_word self.sentence_padding_length = sentence_padding_length self.padding_mode = padding_mode self.to_onehot_array = to_onehot_array self.word2vec_to_solve_oov = word2vec_to_solve_oov self.kwargs = kwargs # 检验参数合法性 assert self.padding_mode in ['center', 'left', 'right', 'none'], 'padding mode 只能取: center,left,right,none' assert self.feature_type in ['word', 'seg', 'word_seg', 'word_seg_concat'], 'feature type 只能取: word,seg和word_seg' # 初始化jieba分词器 if need_segmented: self.jieba_seg = Jieba_Util(verbose=self.verbose) # 训练库提取出来的字典对象 self.train_data_dict = None # 训练库提取出来的字典词汇列表 self.vocabulary = None # 训练库提取出来的字典词汇个数 self.vocabulary_size = None # UNKOWN字符的索引 self.unknow_token_index = None # PADDING字符的索引 self.padding_token_index = None # region NOTE: 这些变量不再维护,因为消耗内存 # 原始训练数据 # self.train_data = None # 切完词的句子 # self.segmented_sentences = None # 训练库句子的字典索引形式 # self.train_index = None # 训练库句子的补齐的字典索引形式 # self.train_padding_index = None # 训练库句子装成onehot array # endregion self.train_onehot_array = None # word2vec 模型 self.word2vec_model = None if word2vec_to_solve_oov: assert kwargs.has_key('word2vec_model_file_path'), '请提供 属性 word2vec_model_file_path' # 加载word2vec模型 w2v_util = Word2vecUtil() self.word2vec_model = w2v_util.load(kwargs.get('word2vec_model_file_path')) if verbose > 1: logging.debug('build feature encoder...') 
print('build feature encoder...') # self.fit_transform() def segment_sentence(self, sentence): """ 对句子进行分词,使用jieba分词 :param sentence: 句子 :type sentence: str :return: 分完词句子,以空格连接 :rtype: str """ if self.feature_type == 'seg': segmented_sentence = self.jieba_seg.seg( sentence, sep=' ', full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) elif self.feature_type == 'word': # 将句子切分为 以字为单元 以空格分割 # 1. 先使用jieba进行预处理,将数字替换等 segmented_sentence = self.jieba_seg.iter_each_word( sentence, sep=' ', need_segmented=True, full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) # 2. 按字切分 elif self.feature_type == 'word_seg': # 将句子切分为 以字和词为单元,相同则去重 以空格分割 # 1. 先使用jieba进行预处理,将数字替换等 segmented_sentence = self.jieba_seg.seg( sentence, sep=' ', full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) # print(segmented_sentence) # 2. 按字切分 word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split() # 3. 按词切分 seg = segmented_sentence.split() segmented_sentence = ' '.join(set(seg + word)) elif self.feature_type == 'word_seg_concat': # 先字后词拼接,不去重 # 1. 先使用jieba进行预处理,将数字替换等 segmented_sentence = self.jieba_seg.seg( sentence, sep=' ', full_mode=self.full_mode, remove_stopword=self.remove_stopword, replace_number=self.replace_number, lowercase=self.lowercase, zhs2zht=self.zhs2zht, remove_url=self.remove_url, HMM=False, ) # print(segmented_sentence) # 2. 按字切分 word = self.jieba_seg.iter_each_word(segmented_sentence, sep=' ', need_segmented=False).split() # 3. 按词切分 seg = segmented_sentence.split() segmented_sentence = ' '.join(word + seg) else: assert False, '不支持其他粒度的切分!' 
return segmented_sentence def get_sentence_length(self, sentence): ''' 计算句子的长度,注意,这里的长度以词为单位,即分完词后统计。 1. 对句子分词 2. 对句子的词计算 :param sentence: 句子 :type sentence: str :return: 句子长度 :rtype: int ''' # 1. 分词 segmented_senence = self.segment_sentence(sentence) # 2. 统计 sentence_length = len(segmented_senence.split()) return sentence_length def print_sentence_length_detail( self, data=None, lengths=[7, 10, 15, 20,50,80,100], ): """ 打印训练库中句子的长度情况 :type lengths: list :param lengths: 长度界限列表 :return: 句子长度列表 :rtype: list """ if self.need_segmented: sentence_length = map(self.get_sentence_length, data) else: sentence_length = map(lambda x: len(x.split()), data) for l in lengths: le_this_len = sum(np.asarray(sentence_length) <= l) / (1.0 * len(sentence_length)) print('句子长度小于等于%d的有:%f' % (l, le_this_len)) print('句子长度情况为:%s' % (str(sentence_length))) print('句子最长长度为:%d' % (max(sentence_length))) print('句子最短长度为:%d' % (min(sentence_length))) print('句子平均长度为:%d' % (np.average(sentence_length))) return sentence_length def get_unkown_vector(self, ndim=50): rand = np.random.RandomState(1337) return rand.uniform(-0.25, 0.25, ndim) def get_w2vEmbedding(self, word): """ 返回词向量 Returns ------- (array,str) """ try: if word == u'PADDING': vector = np.zeros(self.word2vec_model.vector_size) flag = 'PADDING' elif word == u'UNKOWN': # 当训练 vector = self.get_unkown_vector(self.word2vec_model.vector_size) flag = 'NO_IN_MODEL_VOCAB' else: vector = self.word2vec_model[word] flag = 'OK' except: vector = self.get_unkown_vector(self.word2vec_model.vector_size) if self.verbose > 1: print('OOV: %s' % word) flag = 'NO_IN_W2V' return np.asarray(vector), flag def to_embedding_weight(self, path): """ 使用训练好的 word2vec 模型 将字典中每个词转为 word2vec向量,接着生成一个 Embedding层的初始权重形式,可用于初始化 Embedding 层的权重。 1. 加载word2vec模型 2. 
:param path: word2vec 模型文件路径 :type path: str :return: """ if self.word2vec_model is None: w2v_util = Word2vecUtil() self.word2vec_model = w2v_util.load(path) size = self.vocabulary_size embedding_weights = np.zeros((size, self.word2vec_model.vector_size)) words_count_no_in_w2v = 0 words_count_no_in_vacab = 0 words_count_in = 0 words_count_paddding = 0 for key, value in self.train_data_dict.token2id.items(): vector, flag = self.get_w2vEmbedding(key) embedding_weights[value, :] = vector if flag == 'NO_IN_W2V': words_count_no_in_w2v += 1 if flag == 'NO_IN_MODEL_VOCAB': words_count_no_in_vacab += 1 if flag == 'OK': words_count_in += 1 # print(key) if flag == 'PADDING': words_count_paddding += 1 if self.verbose > 0: print('没有出现在w2v模型中的词有:%d个' % (words_count_no_in_w2v)) print('没有出现在模型vocab中的词有:%d个' % (words_count_no_in_vacab)) print('出现在w2v模型中的词有:%d个' % (words_count_in)) # self.embedding_weights = embedding_weights return embedding_weights def build_dictionary(self, train_X=None, test_X=None): """ 1.对数据进行分词 2.构建训练库字典,插入 一个特殊字符 'UNKOWN'表示未知词 Parameters ---------- train_X : array-like test_X : array-like Returns -------- object: self """ # region -------------- 1.将训练集和测试集合并 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('1.将训练集和测试集合并') print('1.将训练集和测试集合并') if self.kwargs.get('vocabulary_including_test_set', True): X = np.concatenate((train_X, test_X), axis=0) else: X = train_X if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 1.将训练集和测试集合并 --------------- # region -------------- 2.对数据进行分词 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('对数据进行分词') print('对数据进行分词') # -------------- code start : 开始 ------------- if self.need_segmented: segmented_sentences = map(self.segment_sentence, X) else: segmented_sentences = X # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 
2.对数据进行分词 --------------- # region -------------- 3. 将句子补齐到等长 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('2. 将句子补齐到等长') print('2. 将句子补齐到等长') # -------------- code start : 开始 ------------- # 将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING padded_sentences = np.asarray(map(self.sentence_padding, segmented_sentences)) # endregion -------------- 3. 将句子补齐到等长 ------------- # region -------------- region start : 4.构建训练库字典 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('4.构建训练库字典') print('4.构建训练库字典') # -------------- code start : 开始 ------------- logging.debug('=' * 20) logging.debug('首先,构建训练库字典,然后将每个词映射到一个索引,再将所有句子映射成索引的列表') # 将训练库所有句子切分成列表,构成 2D的训练文档,每个单元是一个token, # 比如: [['今年','你','多少岁'],['你', '二十四','小时','在线','吗'],...] # 将分完词句子转成合适的数据格式 train_document = map(lambda x: x.split(), padded_sentences) # 获取训练库字典 if self.padding_mode != 'none': # 为了确保padding的索引是0,所以在最前面加入 PADDING train_document.insert(0, [u'PADDING']) self.train_data_dict = Dictionary.from_documents(train_document) # 更新字典,再字典中添加特殊符号,其中 # UNKOWN表示未知字符,即OOV词汇 if self.add_unkown_word: self.train_data_dict.add_documents([[u'UNKOWN']]) # 获取padding和UNKOWN 的字典索引 self.padding_token_index = self.train_data_dict.token2id.get(u'PADDING', -1) self.unknow_token_index = self.train_data_dict.token2id.get(u'UNKOWN', -1) self.vocabulary_size = len(self.train_data_dict.keys()) # 按索引从小到大排序 self.vocabulary = [token for token, id in sorted(self.train_data_dict.token2id.items(), key=lambda x: x[1])] # print(self.vocabulary_size) # print((self.train_data_dict.token2id.items())) # quit() # -------------- print start : just print info ------------- if self.verbose > 1: logging.debug('训练库字典为:%d' % (len(self.train_data_dict.keys()))) print('训练库字典为:%d' % (len(self.train_data_dict.keys()))) logging.debug(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys()))) print(u'字典有:%s' % (','.join(self.train_data_dict.token2id.keys()))) # -------------- 
print end : just print info ------------- # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 4.构建训练库字典 --------------- return padded_sentences def replace_oov_with_similay_word(self, word2vec_model, sentence): ''' 对句子中oov词使用训练库中最相近的词替换(word2vec余弦相似性) :param sentence: :return: ''' # is_oov = np.asarray([item for item in self.feature_encoder.vocabulary]) # has_oov = any(is_oov) sentence = sentence.split() oov_word = [] replace_word = [] for item in sentence: if item not in self.vocabulary: oov_word.append(item) keywords_sim_score = np.asarray( [self.word_similarity(word2vec_model, item, i) for i in self.vocabulary]) sorted_index = np.argsort(keywords_sim_score)[-1::-1] most_similarity_score = keywords_sim_score[sorted_index[0]] most_similarity_word = self.vocabulary[sorted_index[0]] if self.verbose > 1: print(u'%s 最相近的词是%s,分数为:%f' % (item, most_similarity_word, most_similarity_score)) replace_word.append(most_similarity_word) sentence += replace_word return ' '.join(sentence) def word_similarity(self, word2vec_model, word1, word2): ''' 计算两个词的相似性 :param word1: :param word2: :return: ''' try: return word2vec_model.n_similarity(word1, word2) except: return 0 def sentence_to_index(self, sentence): """ 将 sentence 转换为 index,如果 token为OOV词,则分配为 UNKOWN Parameters ---------- sentence: str 以空格分割 """ if self.add_unkown_word: unknow_token_index = self.train_data_dict.token2id[u'UNKOWN'] else: unknow_token_index = 0 # 将训练库中所有句子的每个词映射到索引上,变成索引列表 index = [self.train_data_dict.token2id.get(item, unknow_token_index) for item in sentence.split()] if self.verbose > 1: if index.__contains__(unknow_token_index): print('unknow_token_index:%d' % unknow_token_index) print('出现字典OOV') print(sentence) print(index) # assert not index.__contains__(-1),u'出现OOV词' return index def sentence_padding(self, sentence): ''' 将不等长的句子都对齐,超出padding_length长度的句子截断,小于的则补 PADDING :type sentence: str :param sentence: 句子,词之间以 空格 分割 
:return: 返回补齐后的句子,以空格分割 :type: str ''' padding_length = self.sentence_padding_length # print(sentence) sentence = sentence.split() sentence_length = len(sentence) # print(sentence_length) if sentence_length > padding_length: # logging.debug(u'对句子进行截断:%s' % (sentence)) sentence = sentence[:padding_length] # logging.debug(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length]))) # print(u'对句子进行截断后:%s' % (' '.join(seg[:padding_length]))) elif sentence_length < padding_length: should_padding_length = padding_length - sentence_length left_padding = np.asarray(['PADDING'] * (should_padding_length / 2)) right_padding = np.asarray(['PADDING'] * (should_padding_length - len(left_padding))) if self.padding_mode == 'center': sentence = np.concatenate((left_padding, sentence, right_padding), axis=0) elif self.padding_mode == 'left': sentence = np.concatenate((left_padding, right_padding, sentence), axis=0) elif self.padding_mode == 'right': sentence = np.concatenate((sentence, left_padding, right_padding), axis=0) elif self.padding_mode == 'none': sentence = sentence else: raise NotImplemented sentence = ' '.join(sentence) return sentence def sentence_index_to_onehot(self, index): ''' 注意:该方法跟[sentence_index_to_bow]的区别。 将词的索引转成 onehot 编码,比如: 索引 1 -->[ 0 , 0 , 0 , 0, 1] :param index: 一个词的字典索引 :type index: list :return: onehot 编码,shape为 (句子长度,字典长度) :rtype: np.array() ''' onehot_array = [] for item in index: temp = np.zeros(self.vocabulary_size, dtype=int) if item == 0: pass else: temp[item - 1] = 1 onehot_array.append(temp) # onehot_array = np.concatenate(onehot_array,axis=1) onehot_array = np.asarray(onehot_array) return onehot_array def sentence_index_to_bow(self, index): ''' 注意:该方法跟[word_index_to_onehot]的区别。 将句子的字典索引转成 词包向量 编码比如: [1,2]-->[ 0 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0, 0] :param index: 一个句子的字典索引 :type index: list :return: bow 编码,长度为 字典长度 :rtype: np.array() ''' onehot_array = np.zeros(self.vocabulary_size, dtype=int) onehot_array[index] = 1 return onehot_array def 
batch_sentence_index_to_onehot_array(self, sentence_indexs): ''' 将所有训练库句子转成onehot编码的数组,保存在 self.onehot_array 中 :return: onehot编码的数组 ''' self.onehot_array = np.asarray(map(self.sentence_index_to_onehot, sentence_indexs)) return self.onehot_array def fit_transform(self, train_data=None, test_data=None): return self.fit(train_data, test_data).transform(train_data) def fit(self, train_X=None, test_X=None ): """ build feature encoder ---- 构建训练库字典 Notes ------ update_dictionary: 设置 再次调用fit()函数时,是否更新字典,默认为 True,即只在第一次调用fit()函数时才更新 字典 vocabulary_including_test_set: 设置 是否 字典是否包含测试集的词汇,默认包含,即字典包含训练集和测试集的所有词汇。 - 设置为 False ,则 字典只包含训练集中的词汇 Parameters ---------- train_X: array-like 训练句子列表:['','',...,''] test_X: array-like 测试句子列表:['','',...,''] Returns ------- object: 编码后的列表 """ if not self.kwargs.get('update_dictionary', True): # 假如不更新字典,则如果原有的字典在,就直接用原有的字典即可 if self.vocabulary is not None: return self logging.debug('=' * 20) if train_X is None: logging.debug('没有输入训练数据!') assert False, '没有输入训练数据!' if self.kwargs.get('vocabulary_including_test_set', True): if test_X is None: logging.debug('vocabulary_including_test_set=True,构建字典需要全部数据,请输入测试数据!') assert False, 'vocabulary_including_test_set=True,构建字典需要全部数据,请输入测试数据!' # region -------------- 1.构建训练库字典 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('1.构建训练库字典') print('1.构建训练库字典') # -------------- code start : 开始 ------------- # 构建训练库字典 self.build_dictionary(train_X, test_X) # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- 1.构建训练库字典 --------------- return self def transform_sentence(self, sentence): """ 转换一个句子的格式。跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引 1. 分词 2. 转为字典索引列表,之后补齐,输入为补齐的字典索引列表 - 当 参数 to_onehot_array = True (默认为 False)时,直接返回 字典索引 ; - 当 参数 to_onehot_array = False (默认为 False)时,进入第3步,进一步转换成 onehot 向量 ; 3. 
每个词的字典索引变成onehot向量 - 这一步不一定会执行 - to_onehot_array = True 时, 执行 :param sentence: 输入句子,不用分词,进来后会有分词处理 :type sentence: str :return: 补齐的字典索引 :rtype: array-like """ assert self.train_data_dict is not None, '请先fit_transform()模型' # region -------------- 1. 分词 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('1. 分词') print('1. 分词') # -------------- code start : 开始 ------------- # 分词 if self.need_segmented: seg_sentence = self.segment_sentence(sentence) else: seg_sentence = sentence if self.word2vec_to_solve_oov: seg_sentence = self.replace_oov_with_similay_word(self.word2vec_model, seg_sentence) # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- region end : 1. 分词 --------------- # region -------------- 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表') print('2. 转为字典索引列表,之后补齐,输入为补齐的字典索引列表') # -------------- code start : 开始 ------------- paded_sentence = self.sentence_padding(seg_sentence) sentence_index = self.sentence_to_index(paded_sentence) # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- region end : 2. 转为字典索引列表,之后补齐,输出为补齐的字典索引列表 --------------- # region -------------- 3. 将每个词的字典索引变成onehot向量 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) logging.debug('3. 将每个词的字典索引变成onehot向量') print('3. 将每个词的字典索引变成onehot向量') # -------------- code start : 开始 ------------- if self.to_onehot_array: onehot_array = self.sentence_index_to_onehot(sentence_index) else: onehot_array = sentence_index # -------------- code start : 结束 ------------- if self.verbose > 1: logging.debug('-' * 20) print('-' * 20) # endregion -------------- region end : 3. 
将每个词的字典索引变成onehot向量 --------------- return onehot_array def transform(self, X): ''' 批量转换数据,跟训练数据一样的操作,对输入句子进行 padding index 编码,将sentence转为补齐的字典索引 1. 直接调用 self.transform_sentence 进行处理 :param sentence: 输入句子 :type sentence: array-like :return: 补齐的字典索引 :rtype: array-like ''' index = map(lambda x: self.transform_sentence(x), X) # print train_index[:5] return np.asarray(index) def reset(self): """ 清理对象中的数据 - self.vocabulary """ self.vocabulary = None def print_model_descibe(self): ''' 打印模型参数详情 :return: 参数设置详情 :rtype: dict 或 {} ''' import pprint detail = {'train_data_count': len(self.train_data), 'need_segmented': self.need_segmented, 'feature_type': self.feature_type, 'verbose': self.verbose, 'full_mode': self.full_mode, 'remove_stopword': self.remove_stopword, 'replace_number': self.replace_number, 'sentence_padding_length': self.sentence_padding_length, 'padding_mode': 'center', 'vocabulary_size': self.vocabulary_size, 'padding_token_index': self.padding_token_index, 'unknow_token_index': self.unknow_token_index, 'add_unkown_word': True, 'mask_zero': True, } pprint.pprint(detail) logging.debug(detail) return detail
class FeatureEncoder(object):
    """
    BOW feature encoder.

    Built on scikit-learn's CountVectorizer / TfidfVectorizer; turns
    sentences into BOW (count) or TF-IDF vectors.

    Supported feature units: single characters (``word``), segmented words
    (``seg``) and both together (``word_seg``).

    Main methods:
        1. segment_sentence: segment a sentence
        2. transform_sentence: encode one sentence
        3. fit_transform: build the encoder and encode the training data
        4. transform: encode a batch of sentences
        5. print_model_descibe: print the settings of the encoder
    """

    def __init__(self,
                 # rand_seed=1337,
                 verbose=0,
                 need_segmented=True,
                 full_mode=True,
                 remove_stopword=True,
                 replace_number=True,
                 lowercase=True,
                 zhs2zht=True,
                 remove_url=True,
                 feature_method='bow',
                 feature_type='seg',
                 max_features=None,
                 word2vec_to_solve_oov=False,
                 save_middle_result=False,
                 **kwargs):
        """
        Store and validate the settings; load the optional helpers.

        :param verbose: the larger the value, the more output is produced
        :type verbose: int
        :param need_segmented: whether the input still has to be segmented.
            If False every sentence must already have its tokens separated
            by spaces, e.g. ['我 要 买 手机', '你好']; if True raw sentences
            are expected, e.g. ['我要买手机', '你好'].
        :type need_segmented: bool
        :param full_mode: jieba option, use full mode
        :type full_mode: bool
        :param remove_stopword: jieba option, remove stop words
        :type remove_stopword: bool
        :param replace_number: jieba option, replace numbers by NUM
        :type replace_number: bool
        :param lowercase: jieba option, passed through to the segmenter as
            ``lowercase``
        :type lowercase: bool
        :param zhs2zht: jieba option, passed through to the segmenter as
            ``zhs2zht`` (conversion between simplified and traditional
            Chinese)
        :type zhs2zht: bool
        :param remove_url: jieba option, remove weibo URLs (addresses
            starting with http://t.cn/)
        :type remove_url: bool
        :param feature_method: 'bow' or 'tfidf'
        :type feature_method: str
        :param feature_type: feature unit, one of
            - word: single characters, 我要买手机 ---> 我 要 买 手 机
            - seg: segmented words, 我要买手机 ---> 我 要 买 手机
            - word_seg: words and characters, 我要买手机 ---> 我 要 买 手机 手 机
        :type feature_type: str
        :param max_features: maximum number of feature terms
        :type max_features: int
        :param word2vec_to_solve_oov: extend OOV words via word2vec; needs
            the kwarg ``word2vec_model_file_path``
        :type word2vec_to_solve_oov: bool
        :param save_middle_result: keep intermediate results; off by default
            to save memory
        :type save_middle_result: bool
        :param kwargs: extra options, e.g. word2vec_model_file_path
        :type kwargs: dict
        """
        # self.rand_seed = rand_seed
        self.save_middle_result = save_middle_result
        self.verbose = verbose
        self.full_mode = full_mode
        self.remove_stopword = remove_stopword
        self.need_segmented = need_segmented
        self.replace_number = replace_number
        self.lowercase = lowercase
        self.zhs2zht = zhs2zht
        self.remove_url = remove_url
        self.feature_method = feature_method
        self.feature_type = feature_type
        self.max_features = max_features
        self.word2vec_to_solve_oov = word2vec_to_solve_oov
        self.kwargs = kwargs

        # Validate the settings.
        assert self.feature_method in ['bow', 'tfidf'], 'feature method 只能取: bow,tfidf'
        assert self.feature_type in ['word', 'seg', 'word_seg'], 'feature type 只能取: word,seg和word_seg'

        if word2vec_to_solve_oov:
            # "in" instead of dict.has_key(), which only exists on Python 2.
            assert 'word2vec_model_file_path' in kwargs, '请提供 属性 word2vec_model_file_path'
            w2v_util = Word2vecUtil()
            self.word2vec_model = w2v_util.load(
                kwargs.get('word2vec_model_file_path'))

        # jieba segmenter
        if need_segmented:
            self.jieba_seg = Jieba_Util(verbose=self.verbose)

        # Feature encoder: bow or tf-idf vectorizer, created in fit()
        self.feature_encoder = None
        # Dictionary object extracted from the training data
        self.train_data_dict = None
        # Vocabulary extracted from the training data
        self.vocabulary = None
        # Number of vocabulary entries
        self.vocabulary_size = None
        # Number of training examples
        self.train_data_count = 0

        # region intermediate results, only kept on request to save memory
        if self.save_middle_result:
            # Raw training data
            self.train_data = None
            # Segmented sentences
            self.segmented_sentences = None
            # Features of the training sentences
            self.train_features = None
        # endregion

    def segment_sentence(self, sentence):
        """
        Segment a sentence with jieba according to ``self.feature_type``.

        :param sentence: sentence
        :type sentence: str
        :return: the feature units, joined by spaces
        :rtype: str
        """
        if self.feature_type == 'seg':
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word':
            # Character units: jieba pre-processes (number replacement etc.)
            # and then splits into characters.
            segmented_sentence = self.jieba_seg.iter_each_word(
                sentence,
                sep=' ',
                need_segmented=True,
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
        elif self.feature_type == 'word_seg':
            # Words and characters together, duplicates removed.
            # 1. pre-process and segment with jieba
            segmented_sentence = self.jieba_seg.seg(
                sentence,
                sep=' ',
                full_mode=self.full_mode,
                remove_stopword=self.remove_stopword,
                replace_number=self.replace_number,
                lowercase=self.lowercase,
                zhs2zht=self.zhs2zht,
                remove_url=self.remove_url,
                HMM=False,
            )
            # 2. character units
            word = self.jieba_seg.iter_each_word(segmented_sentence,
                                                 sep=' ',
                                                 need_segmented=False).split()
            # 3. word units
            seg = segmented_sentence.split()
            # De-duplicate while keeping first-seen order; joining a set
            # gave a different token order from run to run.
            seen = set()
            units = []
            for token in seg + word:
                if token not in seen:
                    seen.add(token)
                    units.append(token)
            segmented_sentence = ' '.join(units)
        else:
            assert False, '不支持其他粒度的切分!'
        return segmented_sentence

    def reset(self):
        """
        Reset the object: drop the vectorizer so the next fit() creates a
        new one.
        """
        self.feature_encoder = None

    def fit_transform(self, train_data=None, test_data=None):
        """
        Fit the encoder on ``train_data`` and return its encoding.

        :param train_data: training sentences: ['','',...,'']
        :type train_data: array-like
        :param test_data: passed on to ``fit`` (not used there)
        :return: the encoded training sentences
        """
        # Checked before len(), which would raise an unrelated TypeError.
        assert train_data is not None, '没有输入训练数据!'
        self.train_data_count = len(train_data)
        return self.fit(train_data, test_data).transform(train_data)

    def fit(self, train_data=None, test_data=None):
        """
        Build the feature encoder.

        1. Convert the data and segment it (when ``self.need_segmented``).
        2. Create the vectorizer if none exists yet and fit it.

        Sets ``vocabulary`` and ``vocabulary_size``.

        :param train_data: training sentences: ['','',...,'']
        :type train_data: array-like
        :param test_data: unused; accepted for interface compatibility
        :return: self
        """
        if self.verbose > 1:
            logging.debug('build feature encoder...')
            print('build feature encoder...')

        # region -------------- 1. convert and segment -------------
        if self.verbose > 2:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 转换数据格式,并分词')
            print('1. 转换数据格式,并分词')
        assert train_data is not None, '没有输入训练数据!'

        train_data = np.asarray(train_data)
        if self.save_middle_result:
            self.train_data = train_data

        if self.need_segmented:
            train_segmented_sentences = [self.segment_sentence(item)
                                         for item in train_data]
        else:
            train_segmented_sentences = train_data
        if self.verbose > 2:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        if self.feature_encoder is None:
            # Every backslash is escaped: the earlier literal contained
            # "\w", an invalid escape sequence in a non-raw string. The
            # resulting pattern is unchanged.
            token_pattern = u'(?u)\\b\\w+\\b'
            if self.feature_method == 'tfidf':
                self.feature_encoder = TfidfVectorizer(
                    analyzer="word",
                    token_pattern=token_pattern,
                    tokenizer=None,
                    preprocessor=None,
                    lowercase=False,
                    stop_words=None,
                    max_features=self.max_features,
                )
            elif self.feature_method == 'bow':
                self.feature_encoder = CountVectorizer(
                    analyzer="word",
                    token_pattern=token_pattern,
                    tokenizer=None,
                    preprocessor=None,
                    lowercase=False,
                    stop_words=None,
                    max_features=self.max_features,
                )
            else:
                raise NotImplementedError

        # fit() only: the dense matrix formerly built through
        # fit_transform(...).toarray() was thrown away immediately.
        self.feature_encoder.fit(train_segmented_sentences)

        # Vocabulary; get_feature_names() is absent from current
        # scikit-learn releases, get_feature_names_out() from old ones.
        get_names = getattr(self.feature_encoder, 'get_feature_names_out', None)
        if get_names is None:
            get_names = self.feature_encoder.get_feature_names
        self.vocabulary = list(get_names())
        self.vocabulary_size = len(self.vocabulary)

        return self

    def word_similarity(self, word2vec_model, word1, word2):
        """
        Return the similarity of two words, or 0 when it cannot be computed.

        Parameters
        ----------
        word2vec_model : object providing ``n_similarity``
        word1 : first word
        word2 : second word

        Returns
        --------
        similarity score: ``word2vec_model.n_similarity(word1, word2)``;
        0 on any error
        """
        try:
            return word2vec_model.n_similarity(word1, word2)
        except Exception:
            # Deliberate best-effort: an unscorable pair counts as dissimilar.
            return 0

    def replace_oov_with_similay_word(self, word2vec_model, sentence):
        """
        Extend a sentence with the closest in-vocabulary word of every OOV token.

        For each token missing from ``self.vocabulary`` the vocabulary entry
        with the highest ``word_similarity`` score is looked up. The OOV
        tokens are kept; the chosen words are appended to the end of the
        sentence.

        :param word2vec_model: model handed on to ``word_similarity``
        :param sentence: segmented sentence, tokens separated by spaces
        :type sentence: str
        :return: the extended sentence, tokens separated by spaces
        :rtype: str
        """
        sentence = sentence.split()
        # Set built once: O(1) membership tests instead of scanning the list.
        known_words = set(self.vocabulary)
        replace_word = []
        for item in sentence:
            if item not in known_words:
                keywords_sim_score = np.asarray([
                    self.word_similarity(word2vec_model, item, i)
                    for i in self.vocabulary
                ])
                sorted_index = np.argsort(keywords_sim_score)[-1::-1]
                most_similarity_score = keywords_sim_score[sorted_index[0]]
                most_similarity_word = self.vocabulary[sorted_index[0]]
                if self.verbose > 1:
                    print(u'%s 最相近的词是%s,分数为:%f' %
                          (item, most_similarity_word, most_similarity_score))
                replace_word.append(most_similarity_word)
        sentence += replace_word
        return ' '.join(sentence)

    def transform_sentence(self, sentence):
        """
        Encode one sentence the same way the training data was encoded.

        1. Segment (when ``self.need_segmented``) and, when
           ``self.word2vec_to_solve_oov`` is set, extend with similar words.
        2. Encode with the fitted vectorizer.

        :param sentence: input sentence
        :type sentence: str
        :return: the feature vector of the sentence
        :rtype: array-like
        """
        # region -------------- 1. segment -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('1. 分词')
            print('1. 分词')
        if self.need_segmented:
            seg_sentence = self.segment_sentence(sentence)
        else:
            seg_sentence = sentence

        if self.word2vec_to_solve_oov:
            seg_sentence = self.replace_oov_with_similay_word(
                self.word2vec_model, seg_sentence)
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        # region -------------- 2. feature conversion -------------
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
            logging.debug('2. 转为字典索引列表,之后补齐,输入为补齐的字典索引列表')
            print('2. 转为字典索引列表,之后补齐,输入为补齐的字典索引列表')
        features = self.feature_encoder.transform([seg_sentence]).toarray()[0]
        if self.verbose > 1:
            logging.debug('-' * 20)
            print('-' * 20)
        # endregion

        return features

    def transform(self, data):
        """
        Encode many sentences by calling ``transform_sentence`` on each.

        :param data: input sentences
        :type data: array-like
        :return: one feature vector per sentence
        :rtype: np.ndarray
        """
        # A list, not map(): np.asarray() around a lazy map object gives a
        # 0-d object array on Python 3.
        index = [self.transform_sentence(item) for item in data]
        return np.asarray(index)

    def print_model_descibe(self):
        """
        Print, log and return the settings of the encoder.

        :return: the settings
        :rtype: dict
        """
        import pprint
        detail = {
            'train_data_count': self.train_data_count,
            'need_segmented': self.need_segmented,
            'word2vec_to_solve_oov': self.word2vec_to_solve_oov,
            'vocabulary_size': self.vocabulary_size,
            'verbose': self.verbose,
            # 'rand_seed': self.rand_seed,
            'full_mode': self.full_mode,
            'remove_stopword': self.remove_stopword,
            'replace_number': self.replace_number,
            'lowercase': self.lowercase,
            'zhs2zht': self.zhs2zht,
            'remove_url': self.remove_url,
            'feature_method': self.feature_method,
            'feature_type': self.feature_type,
            'max_features': self.max_features,
        }
        pprint.pprint(detail)
        logging.debug(detail)
        return detail
train_data = train_data[['LABEL','SENTENCE']] test_data = test_data[['LABEL','SENTENCE']] logging.debug('=' * 20) logging.debug('对数据进行分词...') logging.debug('-' * 20) jutil = Jieba_Util() if config['feature_type'] == 'word': sentence_to_seg = lambda x: jutil.iter_each_word( sentence=x, need_segmented=True, sep=' ', full_mode=config['full_mode'], remove_stopword=config['remove_stopword'], replace_number=True, lowercase=True, zhs2zht=True, remove_url=True, ) else: sentence_to_seg = lambda x: jutil.seg( sentence=x, sep=' ', full_mode=config['full_mode'], remove_stopword=config['remove_stopword'], replace_number=True, lowercase=True, zhs2zht=True, remove_url=True,