Example #1
# VocabularyProcessor here is tf.contrib.learn's text preprocessor (TensorFlow 1.x);
# MySentences is a project-specific corpus iterator.
def build_vocabulary(inFile, dtype, vocabFile):
    # Maximum text length (in tokens) per document
    MAX_LENGTH = 300
    NB_CLASSES = 2

    # Read in the word-segmented text
    doc = MySentences(inFile, dtype, 'get_content')
    # Map the raw text to word indices; rare words (frequency <= 5) are dropped
    processor = VocabularyProcessor(MAX_LENGTH, min_frequency=5)
    processor.fit(doc)
    processor.save(vocabFile)
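For context, a vocabulary saved this way is normally restored and applied later with transform. A minimal sketch, assuming VocabularyProcessor is tf.contrib.learn.preprocessing.VocabularyProcessor (TensorFlow 1.x) and 'vocab.model' stands in for whatever path was passed as vocabFile:

# Sketch only: restore the processor saved by build_vocabulary and vectorize new text.
import numpy as np
from tensorflow.contrib import learn

processor = learn.preprocessing.VocabularyProcessor.restore('vocab.model')
docs = ['这 是 一个 例子']            # already word-segmented, space-separated
ids = np.array(list(processor.transform(docs)))
print(ids.shape)                      # (1, 300): padded/truncated to MAX_LENGTH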
Example #2
class DatasetVectorizer:
    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            # No raw data supplied: restore a previously saved vocabulary from model_dir
            self.restore()
        else:
            # Flatten the (num_pairs, 2) array of sentence pairs into one flat list of strings
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            self.sentences_lengths = [
                len(str(x).split(' ')) for x in list(raw_sentence_pairs)
            ]
            max_sentence_length = max(self.sentences_lengths)
            # Fit a shared vocabulary; sequences are padded/truncated to the longest sentence seen
            self.vocabulary = VocabularyProcessor(max_sentence_length)
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        # raw_sentence_pairs has shape (num_instances, 2): two sentences per pair
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()

        # Debug aid: report entries that are literally np.nan (e.g. empty cells from pandas)
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)

        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))

        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)

        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
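A short usage sketch of this class, assuming its missing imports (os, numpy as np, and tf.contrib.learn's VocabularyProcessor) are in scope; the directory name and the sentence pairs below are placeholders:

# Sketch only: fit the vectorizer on a small (2, 2) array of sentence pairs.
pairs = np.array([
    ['how old are you', 'what is your age'],
    ['hello there', 'good morning'],
])
vectorizer = DatasetVectorizer('model_dir', raw_sentence_pairs=pairs)
s1, s2 = vectorizer.vectorize_2d(pairs)
print(s1.shape, s2.shape)                              # both (2, max_sentence_len)
print(vectorizer.vectorize('how old are you').shape)   # (1, max_sentence_len)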
Example #3
    def _create_vocab(min_count=3):
        """
        Build the vocabulary.
        Requires the training samples.
        """
        def gen_documents():
            # Stream every training line (positive and negative), dropping the trailing newline
            for path in (TRAIN_POS_PATH, TRAIN_NEG_PATH):
                with codecs.open(path, 'r', 'utf-8') as file:
                    for line in file:
                        yield line[:-1]

        # min_frequency is exclusive, so min_count - 1 keeps words seen at least min_count times
        vocab = VocabularyProcessor(SEQUENCE_LEN,
                                    min_count - 1,
                                    tokenizer_fn=chinese_tokenizer)
        vocab.fit(gen_documents())
        return vocab
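In the original project this looks like a static helper (there is no self argument), with SEQUENCE_LEN, TRAIN_POS_PATH, TRAIN_NEG_PATH and chinese_tokenizer defined at module level. Assuming it is reachable as a plain function and using placeholder values for those names, a sketch of its use:

# Sketch only: the constants below are placeholders, not the project's real values,
# and chinese_tokenizer is the tokenizer shown in Example #5.
SEQUENCE_LEN = 100
TRAIN_POS_PATH = 'data/train_pos.txt'   # one UTF-8 sentence per line
TRAIN_NEG_PATH = 'data/train_neg.txt'

vocab = _create_vocab(min_count=3)              # drops words seen fewer than 3 times
ids = list(vocab.transform(['这部电影不错']))     # each entry has length SEQUENCE_LEN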
Example #4
def make_vocab_processor(name, text, max_length, min_frequency):
    '''
    Generate (and optionally save) a vocabulary model.
    '''
    print('Making vocabulary model...')
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.fit(text)
    if name is None:
        return vp
    else:
        print('Saving vocabulary model to {}'.format(name))
        vp.save(name)
        return vp
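A quick sketch of calling it, with a tiny in-memory corpus standing in for real training text and a placeholder file name:

# Sketch only: corpus and file name are placeholders.
corpus = ['this is a short example', 'another short example sentence']
vp = make_vocab_processor('vocab.model', corpus, max_length=10, min_frequency=0)
ids = list(vp.transform(corpus))   # each entry is a length-10 array of word ids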
Example #5
    """

    for document in documents:
        # 繁体转简体
        text = HanziConv.toSimplified(document)
        # 英文转小写
        text = text.lower()
        # 分词
        yield list(cut(text))


# Pad or truncate each sequence to length 100, and drop words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary; it cannot be changed once fitted
vocab.fit(DOCUMENTS)

# Save the vocabulary and load it back
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert text to sequences of word IDs; unknown words and padding both map to ID 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
class DatasetVectorizer:
    
    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            # No raw data supplied: restore a previously saved vocabulary from model_dir
            self.restore()
        else:
            # Flatten the (num_pairs, 2) array of sentence pairs into one flat list of strings
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            if char_embeddings:
                log('Chosen char embeddings.')
                self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
            else:
                log('Chosen word embeddings.')
                self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
            max_sentence_length = max(self.sentences_lengths)
            log('Maximum sentence length : {}'.format(max_sentence_length))
            
            if char_embeddings:
                log('Processing sentences with char embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                    tokenizer_fn=char_tokenizer,
                )
            else:
                log('Processing sentences with word embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                )
            log('Sentences have been successfully processed.')
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))
    
    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length
    
    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)
    
    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
    
    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))
    
    def vectorize_2d(self, raw_sentence_pairs):
        # raw_sentence_pairs has shape (num_instances, 2): two sentences per pair
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()

        # Debug aid: report entries that are literally np.nan (e.g. empty cells from pandas)
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        
        vectorized_sentence_pairs = np.array(list(self.vocabulary.transform(raw_sentence_pairs)))
        
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(num_instances, num_classes,
                                                                      self.max_sentence_len)
        
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
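As with the earlier DatasetVectorizer, a short usage sketch; char_tokenizer and log are project helpers not shown here, so this assumes word-level embeddings and that a log function is in scope:

# Sketch only: word-level vectorization with placeholder data and directory name.
pairs = np.array([
    ['how old are you', 'what is your age'],
    ['hello there', 'good morning'],
])
vectorizer = DatasetVectorizer('model_dir', char_embeddings=False, raw_sentence_pairs=pairs)
s1, s2 = vectorizer.vectorize_2d(pairs)
print(s1.shape, s2.shape)   # both (2, max_sentence_len)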