def build_dictionary(self):
    logging.debug('=' * 20)
    logging.debug('First, build the training-corpus dictionary, map each word to an index, '
                  'then map every sentence to a list of indices')
    # Build the training-corpus dictionary.
    # Split every sentence in the corpus into a token list, forming a 2D document
    # where each cell is one token, e.g.:
    # [['今年', '你', '多少岁'], ['你', '二十四', '小时', '在线', '吗'], ...]
    train_document = [sentence.split() for sentence in self.__seg_sentence__]
    gensim_dict = Dictionary.from_documents(train_document)
    # Update the dictionary by adding a special symbol:
    # 'UNKOWN' stands for unknown characters, i.e. OOV words.
    gensim_dict.add_documents([[u'UNKOWN']])
    logging.debug('Added special symbol (UNKOWN) to the dictionary; new size: %d'
                  % len(gensim_dict.keys()))
    self.__gensim_dict__ = gensim_dict
    self.__vocabulary_size__ = len(gensim_dict.keys())
    logging.debug('Training-corpus dictionary size: %d' % self.__vocabulary_size__)
    print('Training-corpus dictionary size: %d' % self.__vocabulary_size__)
    logging.debug(u'Dictionary tokens: %s' % ','.join(gensim_dict.token2id.keys()))
    print(u'Dictionary tokens: %s' % ','.join(gensim_dict.token2id.keys()))
    # One extra all-zero row is reserved beyond the vocabulary size.
    embedding_weights = np.zeros((self.__vocabulary_size__ + 1,
                                  self.__word_embedding_length__))
    for token, index in gensim_dict.token2id.items():
        # TODO: build the word-vector lookup table
        embedding_weights[index, :] = self.get_w2vEmbedding(token)
    self.__embedding_weights__ = embedding_weights
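# A minimal, standalone sketch (Python 3, gensim >= 3.x assumed) of the same idea
# as build_dictionary above: build a gensim Dictionary over tokenized sentences,
# append an 'UNKOWN' token for OOV words, and fill one embedding row per token.
# `lookup_vector` and `demo_build_dictionary` are hypothetical names introduced
# here; lookup_vector only stands in for self.get_w2vEmbedding, which in the real
# code would query a trained word2vec model.
import numpy as np
from gensim.corpora import Dictionary

EMBEDDING_DIM = 4  # assumed small dimension, for illustration only

def lookup_vector(token, dim=EMBEDDING_DIM):
    # Deterministic pseudo-random vector per token, standing in for word2vec.
    seed = sum(ord(ch) for ch in token) % (2 ** 32)
    rng = np.random.RandomState(seed)
    return rng.uniform(-0.25, 0.25, dim)

def demo_build_dictionary():
    sentences = [u'今年 你 多少岁', u'你 二十四 小时 在线 吗']
    documents = [s.split() for s in sentences]
    gensim_dict = Dictionary.from_documents(documents)
    gensim_dict.add_documents([[u'UNKOWN']])  # reserve an index for OOV words
    vocabulary_size = len(gensim_dict.keys())
    # One extra all-zero row, mirroring the `vocabulary_size + 1` allocation above.
    embedding_weights = np.zeros((vocabulary_size + 1, EMBEDDING_DIM))
    for token, index in gensim_dict.token2id.items():
        embedding_weights[index, :] = lookup_vector(token)
    return gensim_dict, embedding_weights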
def build_dictionary(self):
    logging.debug('=' * 20)
    logging.debug('First, build the training-corpus dictionary, map each word to an index, '
                  'then map every sentence to a list of indices')
    # Build the training-corpus dictionary.
    # Split every sentence into a token list, forming a 2D document of tokens, e.g.:
    # [['今年', '你', '多少岁'], ['你', '二十四', '小时', '在线', '吗'], ...]
    train_document = [sentence.split() for sentence in self.__seg_sentence__]
    gensim_dict = Dictionary.from_documents(train_document)
    logging.debug('Training-corpus dictionary size: %d' % len(gensim_dict.keys()))
    print('Training-corpus dictionary size: %d' % len(gensim_dict.keys()))
    # Update the dictionary by adding a special symbol:
    # 'UNKOWN' stands for unknown characters, i.e. OOV words.
    gensim_dict.add_documents([[u'UNKOWN']])
    logging.debug('Added special symbol (UNKOWN) to the dictionary; new size: %d'
                  % len(gensim_dict.keys()))
    print('Added special symbol (UNKOWN) to the dictionary; new size: %d'
          % len(gensim_dict.keys()))
    logging.debug(u'Dictionary tokens: %s' % ','.join(gensim_dict.token2id.keys()))
    print(u'Dictionary tokens: %s' % ','.join(gensim_dict.token2id.keys()))
    self.__gensim_dict__ = gensim_dict
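# A short sketch of the follow-up step the logging message describes ("map every
# sentence to a list of indices"): unseen tokens fall back to the UNKOWN index.
# `sentences_to_indices` is a hypothetical helper introduced here; newer gensim
# versions also offer Dictionary.doc2idx for this, but a plain token2id lookup
# keeps the sketch version-independent. It assumes u'UNKOWN' was already added.
def sentences_to_indices(gensim_dict, sentences):
    unknown_index = gensim_dict.token2id[u'UNKOWN']
    return [[gensim_dict.token2id.get(token, unknown_index)
             for token in sentence.split()]
            for sentence in sentences]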
def build_dictionary(self, train_X=None, test_X=None):
    """Build the training-corpus dictionary.

    1. Segment the data into words.
    2. Build the dictionary, inserting the special token 'UNKOWN' for unknown words.

    Parameters
    ----------
    train_X : array-like
    test_X : array-like

    Returns
    -------
    padded_sentences : numpy.ndarray
        The segmented sentences, padded/truncated to equal length.
    """
    # region -------------- 1. Merge the training and test sets -------------
    if self.verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('1. Merge the training and test sets')
        print('1. Merge the training and test sets')

    if self.kwargs.get('vocabulary_including_test_set', True):
        X = np.concatenate((train_X, test_X), axis=0)
    else:
        X = train_X

    if self.verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # endregion -------------- 1. Merge the training and test sets ---------------

    # region -------------- 2. Segment the data into words -------------
    if self.verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('2. Segment the data into words')
        print('2. Segment the data into words')

    if self.need_segmented:
        segmented_sentences = [self.segment_sentence(x) for x in X]
    else:
        segmented_sentences = X

    if self.verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
    # endregion -------------- 2. Segment the data into words ---------------

    # region -------------- 3. Pad sentences to equal length -------------
    if self.verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('3. Pad sentences to equal length')
        print('3. Pad sentences to equal length')

    # Align sentences of unequal length: truncate sentences longer than
    # padding_length and pad shorter ones with PADDING.
    padded_sentences = np.asarray(
        [self.sentence_padding(s) for s in segmented_sentences])
    # endregion -------------- 3. Pad sentences to equal length -------------

    # region -------------- 4. Build the training-corpus dictionary -------------
    if self.verbose > 1:
        logging.debug('-' * 20)
        print('-' * 20)
        logging.debug('4. Build the training-corpus dictionary')
        print('4. Build the training-corpus dictionary')

    logging.debug('=' * 20)
    logging.debug('First, build the training-corpus dictionary, map each word to an index, '
                  'then map every sentence to a list of indices')
    # Split every sentence in the corpus into a token list, forming a 2D document
    # where each cell is one token, e.g.:
    # [['今年', '你', '多少岁'], ['你', '二十四', '小时', '在线', '吗'], ...]
    train_document = [sentence.split() for sentence in padded_sentences]
    if self.padding_mode != 'none':
        # Prepend PADDING so that its index is guaranteed to be 0.
        train_document.insert(0, [u'PADDING'])
    self.train_data_dict = Dictionary.from_documents(train_document)
    # Update the dictionary by adding the special symbol 'UNKOWN',
    # which stands for unknown characters, i.e. OOV words.
    if self.add_unkown_word:
        self.train_data_dict.add_documents([[u'UNKOWN']])
    # Look up the dictionary indices of PADDING and UNKOWN.
    self.padding_token_index = self.train_data_dict.token2id.get(u'PADDING', -1)
    self.unknow_token_index = self.train_data_dict.token2id.get(u'UNKOWN', -1)
    self.vocabulary_size = len(self.train_data_dict.keys())
    # Tokens sorted by dictionary index, ascending.
    self.vocabulary = [token for token, token_id
                       in sorted(self.train_data_dict.token2id.items(),
                                 key=lambda x: x[1])]

    if self.verbose > 1:
        logging.debug('Training-corpus dictionary size: %d' % len(self.train_data_dict.keys()))
        print('Training-corpus dictionary size: %d' % len(self.train_data_dict.keys()))
        logging.debug(u'Dictionary tokens: %s' % ','.join(self.train_data_dict.token2id.keys()))
        print(u'Dictionary tokens: %s' % ','.join(self.train_data_dict.token2id.keys()))
        logging.debug('-' * 20)
        print('-' * 20)
    # endregion -------------- 4. Build the training-corpus dictionary ---------------
    return padded_sentences
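# A hypothetical sketch of the sentence_padding step used above: truncate token
# lists longer than padding_length and right-pad shorter ones with the PADDING
# token. The real self.sentence_padding may differ; this only illustrates the
# truncate/pad contract described in the comments, with padding_length assumed.
def sentence_padding(sentence, padding_length=5, pad_token=u'PADDING'):
    tokens = sentence.split()
    tokens = tokens[:padding_length]  # truncate sentences that are too long
    tokens += [pad_token] * (padding_length - len(tokens))  # pad short ones
    return ' '.join(tokens)

# Example usage:
#   sentence_padding(u'你 二十四 小时 在线 吗 在线') keeps only the first 5 tokens;
#   sentence_padding(u'今年 你 多少岁') becomes u'今年 你 多少岁 PADDING PADDING'.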