def __create_sequences(self, X, labels, sampling=True):
    """Convert raw texts into padded, shuffled training sequences.

    Tokenizes *X* with the instance tokenizer, pads every sequence to
    ``self.params['max_length']``, shuffles sequences and *labels* with
    the same random permutation, and — unless ``sampling`` is False —
    rebalances classes 0 and 1 via ``Sampling(2., .5)``.

    Returns:
        Tuple ``(x_train, y_train)`` of aligned numpy arrays.
    """
    padded = pad_sequences(
        self.tokenizer.texts_to_sequences(X),
        padding='post',
        maxlen=self.params['max_length'],
    )

    # One shared permutation keeps sequences and labels aligned.
    order = np.arange(padded.shape[0])
    np.random.shuffle(order)
    shuffled_data = padded[order]
    shuffled_labels = labels[order]

    if not sampling:
        return shuffled_data, shuffled_labels

    balancer = Sampling(2., .5)
    return balancer.perform_sampling(shuffled_data, shuffled_labels, [0, 1])
# --- Tokenize, pad, shuffle, and split the corpus into train/validation ---
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
data = pad_sequences(sequences, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle data and labels with the same permutation so they stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Rebalance classes 0/1 on the training portion only; validation is untouched.
sample = Sampling(2., .5)
x_train, y_train = sample.perform_sampling(data[:-num_validation_samples],
                                           labels[:-num_validation_samples],
                                           [0, 1])
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
print('Number of entries in each category:')
print('training: ', y_train.sum(axis=0))
print('validation: ', y_val.sum(axis=0))

# --- Build the embedding matrix from a pre-trained Word2Vec model ---
model = Word2Vec.load('1ft.modelFile')
embeddings_index = {}  # NOTE(review): unused here — kept in case other code references it
# Random init: rows for words absent from the Word2Vec vocabulary keep
# these random vectors instead of crashing the build.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    # BUG FIX: `model.wv[word]` raises KeyError for out-of-vocabulary words,
    # so the original `if embedding_vector is not None` guard could never
    # fire. Test vocabulary membership before indexing instead.
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]