def __create_sequences(self, X, labels, sampling=True):
    # Convert raw texts to integer ID sequences and pad them to a fixed length.
    sequences = self.tokenizer.texts_to_sequences(X)
    data = pad_sequences(sequences, padding='post',
                         maxlen=self.params['max_length'])

    # Shuffle data and labels in unison.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    # Optionally rebalance the two classes before training.
    if sampling:
        sample = Sampling(2., .5)
        x_train, y_train = sample.perform_sampling(data, labels, [0, 1])
    else:
        x_train, y_train = data, labels

    return x_train, y_train
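
Both examples call a project-specific Sampling(2., .5) helper whose source is not included here. Purely to illustrate the interface, below is a minimal sketch that assumes the two constructor arguments are an oversampling ratio for the minority class and an undersampling ratio for the majority class, and that labels may be one-hot encoded; the real implementation may differ.

import numpy as np

class Sampling:
    """Hypothetical sketch of the project-local Sampling helper."""

    def __init__(self, oversample_ratio, undersample_ratio):
        # Assumed semantics: grow the minority class by `oversample_ratio`
        # and shrink the majority class by `undersample_ratio`.
        self.oversample_ratio = oversample_ratio
        self.undersample_ratio = undersample_ratio

    def perform_sampling(self, data, labels, classes):
        # Collapse one-hot labels to class ids for counting.
        y = np.argmax(labels, axis=1) if labels.ndim > 1 else labels
        counts = {c: int(np.sum(y == c)) for c in classes}
        minority = min(counts, key=counts.get)
        majority = max(counts, key=counts.get)

        min_idx = np.where(y == minority)[0]
        maj_idx = np.where(y == majority)[0]

        # Oversample the minority class with replacement ...
        min_idx = np.random.choice(
            min_idx, int(len(min_idx) * self.oversample_ratio), replace=True)
        # ... and undersample the majority class without replacement.
        maj_idx = np.random.choice(
            maj_idx, int(len(maj_idx) * self.undersample_ratio), replace=False)

        keep = np.concatenate([min_idx, maj_idx])
        np.random.shuffle(keep)
        return data[keep], labels[keep]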
Example #2
import numpy as np
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences

# Assumes a fitted `tokenizer`, raw texts `X`, one-hot `labels`, and the
# constants MAX_SEQUENCE_LENGTH, VALIDATION_SPLIT and EMBEDDING_DIM.
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index

data = pad_sequences(sequences, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle before splitting so the validation set is a random sample.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Hold out the last VALIDATION_SPLIT fraction for validation and rebalance
# only the training portion with the project's Sampling helper.
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
sample = Sampling(2., .5)
x_train, y_train = sample.perform_sampling(data[:-num_validation_samples],
                                           labels[:-num_validation_samples],
                                           [0, 1])
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
print('Number of entries in each category:')
print('training: ', y_train.sum(axis=0))
print('validation: ', y_val.sum(axis=0))

# Load pre-trained vectors and build the embedding matrix; rows for words
# absent from the Word2Vec vocabulary keep their random initialization
# (indexing model.wv with an unknown word would raise a KeyError).
model = Word2Vec.load('1ft.modelFile')

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]
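
A common next step, not part of the original snippet, is to wire embedding_matrix into a frozen Keras Embedding layer. A minimal sketch, assuming the same word_index, MAX_SEQUENCE_LENGTH and EMBEDDING_DIM as above:

from keras.layers import Embedding

# Sketch: map token ids to the pre-trained vectors; freeze the weights so
# training does not overwrite them.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)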