import numpy as np
from keras.preprocessing.text import Tokenizer


def tokenize(self, writeDictionaryToCsv=False):
    print('Tokenizing')
    print('adding n-grams')
    self.trainData = self.addNgGrams(self.trainData)
    self.testData = self.addNgGrams(self.testData)
    # Fit the vocabulary on the train and test texts together
    all_reviews = list(self.trainData) + list(self.testData)
    tokenizer = Tokenizer(num_words=30000)
    print('fitting')
    # fit_on_texts builds the word index; fit_on_sequences expects integer
    # sequences rather than raw texts, so it is not called here
    tokenizer.fit_on_texts(all_reviews)
    print('texts_to_sequences')
    self.trainData = tokenizer.texts_to_sequences(self.trainData)
    self.testData = tokenizer.texts_to_sequences(self.testData)
    print('sequences_to_matrix')
    self.trainData = tokenizer.sequences_to_matrix(self.trainData)
    self.testData = tokenizer.sequences_to_matrix(self.testData)
    all_reviews = np.vstack((self.trainData, self.testData))  # does not work with SVM; no time to change the code
    # Normalize each row to sum to 1 (term-frequency normalization)
    self.trainData = self.trainData / self.trainData.sum(axis=1)[:, None]
    self.testData = self.testData / self.testData.sum(axis=1)[:, None]
    if writeDictionaryToCsv:
        self.ExportFeatureSpace(tokenizer)
    print('Finished tokenizing')
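# --- Hedged sketch (not from the source) ----------------------------------
# addNgGrams is called above but its body is not shown. The helper below is
# a minimal guess at what it might do, assuming it appends '_'-joined
# n-grams to each review text so the Tokenizer counts them as extra tokens;
# the name, signature, and bigram default are assumptions.
def addNgGrams(self, reviews, n=2):
    augmented = []
    for review in reviews:
        words = review.split()
        # Join each n-gram with '_' so it survives the Tokenizer's splitting
        ngrams = ['_'.join(words[i:i + n]) for i in range(len(words) - n + 1)]
        augmented.append(' '.join(words + ngrams))
    return augmented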
from keras.datasets import imdb
from sklearn.decomposition import PCA


def closure(mu):
    (x_train, y_train), (_, _) = imdb.load_data()
    tokenizer = Tokenizer(num_words=5000)
    # The IMDB data is already integer sequences, so fit_on_sequences
    # collects the document counts needed by the tf-idf matrix mode
    tokenizer.fit_on_sequences(x_train)
    x_train = tokenizer.sequences_to_matrix(x_train, mode="tfidf")
    # Note: svd_solver='full' is needed on the GPU server
    x_train = PCA(n_components=100, svd_solver='full').fit_transform(x_train)
    ds = {"data": x_train, "target": y_train}
    # Apply noise and return
    res = preprocess_and_noise(dataset=ds, mu=mu)
    return res
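# --- Hedged usage sketch (not from the source) -----------------------------
# Assumes closure is meant to be swept over several noise levels mu and that
# preprocess_and_noise returns the noised dataset; both are assumptions, and
# the mu values below are illustrative only.
if __name__ == '__main__':
    for mu in (0.0, 0.1, 0.5, 1.0):
        noised = closure(mu)
        print('mu =', mu, '->', type(noised))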