def train_model(self):
    tokenizer = Tokenizer(num_words=self.max_words)
    # Vectorize the index sequences into fixed-size binary bag-of-words vectors
    x_train = tokenizer.sequences_to_matrix(self.x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(self.x_test, mode='binary')
    y_train = tokenizer.sequences_to_matrix(self.y_train, mode='binary')
    y_test = tokenizer.sequences_to_matrix(self.y_test, mode='binary')
    # y_train = keras.utils.to_categorical(self.y_train, self.num_classes)
    # y_test = keras.utils.to_categorical(self.y_test, self.num_classes)

    # One hidden layer; sigmoid outputs paired with MSE loss and binary accuracy
    self.model = Sequential()
    self.model.add(Dense(1024, input_shape=(self.max_words,)))
    self.model.add(Activation('tanh'))
    self.model.add(Dense(self.num_classes))
    self.model.add(Activation('sigmoid'))
    self.model.compile(loss='mean_squared_error',
                       optimizer='rmsprop',
                       metrics=['binary_accuracy'])

    history = self.model.fit(x_train, y_train,
                             batch_size=self.batch_size,
                             epochs=self.epochs,
                             verbose=1,
                             validation_data=(x_test, y_test))

    # Plot training vs. validation loss per epoch
    pyplot.plot(history.history['loss'])
    pyplot.plot(history.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()
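# Minimal, self-contained sketch (not part of the snippet above) of what
# sequences_to_matrix(mode='binary') produces: each sequence of word indices
# becomes a fixed-width 0/1 row with a 1 at every index that occurs in it.
from keras.preprocessing.text import Tokenizer

demo_tokenizer = Tokenizer(num_words=10)
demo_sequences = [[1, 3, 3, 7], [2, 5]]
demo_matrix = demo_tokenizer.sequences_to_matrix(demo_sequences, mode='binary')
print(demo_matrix.shape)  # (2, 10)
print(demo_matrix[0])     # 1.0 at columns 1, 3 and 7, 0.0 elsewhere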
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from sklearn.decomposition import PCA


def closure(mu):
    # Load the IMDB reviews as integer word-index sequences
    (x_train, y_train), (_, _) = imdb.load_data()
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_sequences(x_train)
    # Vectorize each review into a 5000-dimensional tf-idf vector
    x_train = tokenizer.sequences_to_matrix(x_train, mode='tfidf')
    # Note: svd_solver='full' is needed on the GPU server
    x_train = PCA(n_components=100, svd_solver='full').fit_transform(x_train)
    ds = {"data": x_train, "target": y_train}
    # Apply noise and return
    res = preprocess_and_noise(dataset=ds, mu=mu)
    return res
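# Rough, self-contained sketch (toy data; not part of closure() above) of the
# tf-idf vectorization plus PCA projection step that closure() applies to the
# IMDB matrix, shown here on a handful of short sequences.
from keras.preprocessing.text import Tokenizer
from sklearn.decomposition import PCA

toy_sequences = [[1, 2, 2, 3], [2, 4], [1, 1, 5, 6], [3, 3, 3, 7]]
toy_tokenizer = Tokenizer(num_words=8)
toy_tokenizer.fit_on_sequences(toy_sequences)  # collects document counts for tf-idf
toy_tfidf = toy_tokenizer.sequences_to_matrix(toy_sequences, mode='tfidf')
toy_reduced = PCA(n_components=2, svd_solver='full').fit_transform(toy_tfidf)
print(toy_tfidf.shape, '->', toy_reduced.shape)  # (4, 8) -> (4, 2)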
print(y_train[0])
print(word_index['the'])

# Invert the word index so integer ids map back to words, then decode a review
index_to_word = {}
for key, value in word_index.items():
    index_to_word[value] = key
print(' '.join(index_to_word[x] for x in x_train[9]))

from keras_preprocessing.text import Tokenizer

# Vectorize the index sequences into binary bag-of-words vectors
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

# One-hot encode the class labels
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train.shape)
print(x_train[0])
print(y_train.shape)
print(y_train[0])

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
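# The snippet above stops right after `model = Sequential()`. A plausible
# continuation (an assumption mirroring the dense classifier used elsewhere in
# this section, not the original author's exact layers) would be:
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])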
def load(self):
    # Setup train and test splits
    if self.source in ['mnist', 'cifar10']:
        (x_train, y_train), (x_test, y_test) = self.db(self.source).load_data()
    elif self.source == 'fashion_mnist':
        (x_train, y_train), (x_test, y_test) = self.db(self.source).load_data()
        # Scale pixel values to [0, 1]
        x_train = x_train / 255.0
        x_test = x_test / 255.0
        self.class_names = [
            'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
            'Shirt', 'Sneaker', 'Bag', 'Ankle boot'
        ]
    elif self.source == 'cifar100':
        (x_train, y_train), (x_test, y_test) = self.db(
            self.source).load_data(label_mode='fine')
    elif self.source == 'imdb':
        (x_train, y_train), (x_test, y_test) = self.db(self.source).load_data(
            path="imdb.npz",
            num_words=10000,  # memory limitation
            skip_top=0,
            maxlen=None,
            seed=113,
            start_char=1,
            oov_char=2,
            index_from=3)
        # Merge both splits, vectorize, then re-split: first 10000 become the test set
        data = np.concatenate((x_train, x_test), axis=0)
        targets = np.concatenate((y_train, y_test), axis=0)
        data = self.__vectorize(data)
        targets = np.array(targets).astype("float32")
        self.x_test = data[:10000]
        self.y_test = targets[:10000]
        self.x_train = data[10000:]
        self.y_train = targets[10000:]
        self.is_text = True
    elif self.source == 'reuters':
        (x_train, y_train), (x_test, y_test) = self.db(
            self.source).load_data(path="reuters.npz",
                                   num_words=None,
                                   skip_top=0,
                                   maxlen=None,
                                   test_split=0.2,
                                   seed=113,
                                   start_char=1,
                                   oov_char=2,
                                   index_from=3)
        self.word_index = self.db(
            self.source).get_word_index(path="reuters_word_index.json")
        self.is_text = True
    else:
        (x_train, y_train), (x_test, y_test) = self.db(self.source).load_data()

    print("Training label shape: ", y_train.shape)  # (60000,) -- 60000 numbers (all 0-9)
    print("First 5 training labels: ", y_train[:5])  # [5, 0, 4, 1, 9]

    # Flatten the images
    train_count = x_train.shape[0]
    test_count = x_test.shape[0]
    image_size = 0
    image_vector_size = 0
    if len(x_train.shape) == 4:
        channels = x_train.shape[3]
    else:
        channels = 1
    if len(x_train.shape) >= 2:
        image_size = x_train.shape[1]
        # Include the channel dimension so RGB datasets (cifar) flatten correctly
        image_vector_size = image_size * image_size * channels

    if self.source in ['mnist', 'cifar10', 'cifar100']:
        self.x_train = x_train
        self.x_test = x_test
        self.y_train_label = y_train.copy()
        self.y_test_label = y_test.copy()
        self.x_train1d = x_train.reshape(train_count, image_vector_size)
        self.x_test1d = x_test.reshape(test_count, image_vector_size)
        self.x_train2d = x_train.reshape(-1, image_size, image_size, channels)
        self.x_test2d = x_test.reshape(-1, image_size, image_size, channels)
        # Convert to "one-hot" vectors using the to_categorical function
        self.num_classes = self.db_categories(self.source)
        self.y_train = keras.utils.to_categorical(y_train, self.num_classes)
        self.y_test = keras.utils.to_categorical(y_test, self.num_classes)
    elif self.source == 'reuters':
        self.num_classes = np.max(y_train) + 1
        max_words = 10000
        # Vectorizing sequence data
        tokenizer = Tokenizer(num_words=max_words)
        self.x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
        self.x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
        print('x_train shape:', x_train.shape)
        print('x_test shape:', x_test.shape)
        # Convert class vector to binary class matrix
        self.y_train = keras.utils.to_categorical(y_train, self.num_classes)
        self.y_test = keras.utils.to_categorical(y_test, self.num_classes)

    print("First 5 training labels as one-hot encoded vectors:\n", y_train[:5])
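# Minimal, self-contained sketch (not part of load() above) of the label
# handling it performs for the image datasets: integer class ids become
# one-hot rows via keras.utils.to_categorical.
import numpy as np
import keras

demo_labels = np.array([5, 0, 4, 1, 9])
demo_one_hot = keras.utils.to_categorical(demo_labels, 10)
print(demo_one_hot.shape)  # (5, 10)
print(demo_one_hot[0])     # 1.0 in column 5, 0.0 elsewhere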
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
print(train_data)
print(train_labels)
print(test_data)
print(test_labels)
print('done')

word_index = reuters.get_word_index()
num_classes = max(train_labels) + 1
max_words = 10000

# Vectorize the word-index sequences into binary bag-of-words vectors
token = Tokenizer(num_words=max_words)
train_data = token.sequences_to_matrix(train_data, mode='binary')
test_data = token.sequences_to_matrix(test_data, mode='binary')

# One-hot encode the topic labels
train_labels = to_categorical(train_labels, num_classes)
test_labels = to_categorical(test_labels, num_classes)

# Single hidden layer classifier over the bag-of-words features
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
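# Minimal follow-on sketch (batch size, epoch count, and validation split are
# assumptions, not taken from the snippet above) showing how the compiled
# model could be trained and scored on the held-out articles:
history = model.fit(train_data, train_labels,
                    batch_size=32,
                    epochs=5,
                    validation_split=0.1,
                    verbose=1)
score = model.evaluate(test_data, test_labels, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])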