def build_vocabulary(inFile, dtype, vocabFile):
    # maximum text length
    MAX_LENGTH = 300
    NB_CLASSES = 2
    # read in the pre-tokenized text
    doc = MySentences(inFile, dtype, 'get_content')
    # map the raw text to word indices
    processor = VocabularyProcessor(MAX_LENGTH, min_frequency=5)
    processor.fit(doc)
    processor.save(vocabFile)
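# Usage sketch (not part of the original snippet). It assumes the
# VocabularyProcessor used throughout these examples is the TF 1.x
# tf.contrib.learn one, and that MySentences is a project-specific iterable of
# whitespace-tokenized documents; the file names and dtype below are placeholders.
from tensorflow.contrib import learn
VocabularyProcessor = learn.preprocessing.VocabularyProcessor

build_vocabulary('reviews.seg.txt', 'train', 'vocab.model')

# The saved vocabulary can later be restored and used to map text to word IDs.
processor = VocabularyProcessor.restore('vocab.model')
ids = list(processor.transform(['some new text to vectorize']))  # one padded ID array per document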
class DatasetVectorizer:

    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            self.sentences_lengths = [
                len(str(x).split(' ')) for x in list(raw_sentence_pairs)
            ]
            max_sentence_length = max(self.sentences_lengths)
            self.vocabulary = VocabularyProcessor(max_sentence_length)
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
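# Hypothetical usage of the class above (the sentence pairs are invented).
# raw_sentence_pairs is expected as a two-column array with one sentence per
# column, e.g. for a sentence-similarity dataset; os, numpy and
# VocabularyProcessor are assumed to be imported as in the surrounding project.
import numpy as np

pairs = np.array([
    ['a man is playing a guitar', 'a person plays an instrument'],
    ['the cat sleeps on the couch', 'a dog barks loudly'],
])
vectorizer = DatasetVectorizer('model_dir', raw_sentence_pairs=pairs)
s1, s2 = vectorizer.vectorize_2d(pairs)          # each has shape (2, max_sentence_len)
single = vectorizer.vectorize('the cat sleeps')  # shape (1, max_sentence_len)
print(vectorizer.vocabulary_size, vectorizer.max_sentence_len)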
def _create_vocab(min_count=3):
    """Build the vocabulary; requires the training samples."""

    def gen_documents():
        for path in (TRAIN_POS_PATH, TRAIN_NEG_PATH):
            with codecs.open(path, 'r', 'utf-8') as file:
                for line in file:
                    yield line[:-1]

    vocab = VocabularyProcessor(SEQUENCE_LEN, min_count - 1,
                                tokenizer_fn=chinese_tokenizer)
    vocab.fit(gen_documents())
    return vocab
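# The helper above relies on module-level constants and a chinese_tokenizer
# (shown in a later snippet). Hypothetical placeholder values, for illustration only:
TRAIN_POS_PATH = 'data/train_pos.txt'
TRAIN_NEG_PATH = 'data/train_neg.txt'
SEQUENCE_LEN = 100

vocab = _create_vocab(min_count=3)  # words seen fewer than 3 times are dropped
vocab.save('vocab.pickle')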
def make_vocab_processor(name, text, max_length, min_frequency):
    '''Generate a vocabulary model.'''
    print('Making vocabulary model...')
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.fit(text)
    if name is None:
        return vp
    else:
        print('Saving vocabulary model to {}'.format(name))
        vp.save(name)
        return vp
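# Illustrative call (the corpus and length values are made up). The returned
# processor maps each document to a fixed-length sequence of word IDs, padding
# with 0 and dropping words that occur min_frequency times or fewer.
corpus = ['the quick brown fox', 'the lazy dog']
vp = make_vocab_processor(None, corpus, max_length=10, min_frequency=0)
ids = list(vp.transform(corpus))  # two ID arrays of length 10 each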
""" for document in documents: # 繁体转简体 text = HanziConv.toSimplified(document) # 英文转小写 text = text.lower() # 分词 yield list(cut(text)) # 序列长度填充或截取到100,删除词频<=2的词 vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer) # 创建词汇表,创建后不能更改 vocab.fit(DOCUMENTS) # 保存和加载词汇表 vocab.save('vocab.pickle') vocab = VocabularyProcessor.restore('vocab.pickle') # 文本转为词ID序列,未知或填充用的词ID为0 id_documents = list(vocab.transform(DOCUMENTS)) for id_document in id_documents: print(id_document) # [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] # [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
class DatasetVectorizer:

    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            if char_embeddings:
                log('Chosen char embeddings.')
                self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
            else:
                log('Chosen word embeddings.')
                self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
            max_sentence_length = max(self.sentences_lengths)
            log('Maximum sentence length : {}'.format(max_sentence_length))

            if char_embeddings:
                log('Processing sentences with char embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                    tokenizer_fn=char_tokenizer,
                )
            else:
                log('Processing sentences with word embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                )
            log('Sentences have been successfully processed.')
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        vectorized_sentence_pairs = np.array(list(self.vocabulary.transform(raw_sentence_pairs)))
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(num_instances, num_classes, self.max_sentence_len)
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
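# The char-embedding branch above references a char_tokenizer that is not part
# of this snippet. A minimal sketch of such a tokenizer, under the assumption
# that it simply splits each sentence into characters (VocabularyProcessor
# accepts any generator of token lists via tokenizer_fn):
def char_tokenizer(sentences):
    for sentence in sentences:
        yield list(str(sentence))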