Example #1
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

def tokenizing_and_vocabulary(train_posts, test_posts, train_tags, test_tags):
    # 20 Newsgroups has 20 target labels
    num_labels = 20
    vocab_size = 15000
    batch_size = 100  # kept for the downstream model; unused in this function

    # define a Tokenizer capped at the vocab_size most frequent words
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(train_posts)

    # vectorize each post as a tf-idf weighted row over the vocabulary
    x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
    x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

    # one-hot encode the labels, fitting the encoder on the training tags only
    encoder = LabelBinarizer()
    encoder.fit(train_tags)
    y_train = encoder.transform(train_tags)
    y_test = encoder.transform(test_tags)
    return x_train, x_test, y_train, y_test
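For context, here is a minimal sketch of how this function might be wired to the actual 20 Newsgroups data using scikit-learn's fetch_20newsgroups. The variable wiring is an assumption, since the snippet does not show where train_posts and the other arguments come from.

# Hedged usage sketch (assumes scikit-learn is available; the wiring
# below is illustrative, not from the original snippet).
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

x_train, x_test, y_train, y_test = tokenizing_and_vocabulary(
    train.data, test.data, train.target, test.target)
print(x_train.shape)  # e.g. (11314, 15000): one tf-idf row per training post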
Example #2
from keras.preprocessing.text import Tokenizer

samples = ['I study at CityU', 'I study at CityU at Seattle']

# cap the vocabulary at the 1000 most frequent words
tokenizer = Tokenizer(num_words=1000)

# build the word index from the sample texts
tokenizer.fit_on_texts(samples)

# turn each text into a list of integer word indices
sequences = tokenizer.texts_to_sequences(samples)

# turn each text into a binary (one-hot) document-term row
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('Sequences: ', sequences, '\n')
print('word_index: ', tokenizer.word_index)
Example #3
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer

text1 = 'some thing to eat'
text2 = 'some thing to drink'
texts = [text1, text2]

print(T.text_to_word_sequence(text1))  # ['some', 'thing', 'to', 'eat']
print(T.one_hot(text1, 10))            # e.g. [7, 9, 3, 4] (hash-based, may collide)
print(T.one_hot(text2, 10))            # e.g. [7, 9, 3, 1]

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
print(tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)])
print(tokenizer.word_index)   # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)    # {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)   # {1: 2, 2: 2, 3: 2, 4: 1, 5: 1}

print(tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]
print(tokenizer.texts_to_matrix(texts))
# [[0. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
#  [0. 1. 1. 1. 0. 1. 0. 0. 0. 0.]]

import keras.preprocessing.sequence as S
print(S.pad_sequences([[1, 2, 3]], maxlen=10, padding='post'))  # [[1 2 3 0 0 0 0 0 0 0]]
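For comparison, a short sketch of the default 'pre' padding and of truncation when a sequence is longer than maxlen. These calls are additions to the original example; both padding and truncating default to 'pre', so zeros and cuts happen at the front of the sequence.

# Illustrative addition: default padding/truncating is 'pre'.
print(S.pad_sequences([[1, 2, 3]], maxlen=10))                          # [[0 0 0 0 0 0 0 1 2 3]]
print(S.pad_sequences([[1, 2, 3, 4, 5]], maxlen=3))                     # [[3 4 5]] (front truncated)
print(S.pad_sequences([[1, 2, 3, 4, 5]], maxlen=3, truncating='post'))  # [[1 2 3]]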
Source of Example #3: vivian_ll on CSDN, https://blog.csdn.net/vivian_ll/article/details/80795139 (original post by the author; please include a link to the post when republishing).