import numpy as np


def test_standardization(self):
    # Toy data: the second column is exactly twice the first.
    X_train = np.array([[1, 2], [2, 4], [3, 6]])
    X_test = np.array([[4, 8], [5, 10]])
    pre_X_train, pre_X_test, _ = Preprocessing.standardization(
        X_train, X_test, mode='zscore')
    print()
    print("test_standardization ===========================")
    print('X_train => \n', X_train)
    print('X_test => \n', X_test)
    print('pre_X_train => \n', pre_X_train)
    print('pre_X_test => \n', pre_X_test)
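# The test above assumes a Preprocessing.standardization(X_train, X_test, mode)
# static method that fits statistics on the training set only and reuses them
# on the test set. A minimal z-score sketch of that interface; the third
# return value is assumed here to be the fitted (mean, std) parameters, and
# the real implementation may differ:
def standardization_sketch(X_train, X_test, mode='zscore'):
    if mode != 'zscore':
        raise ValueError("only 'zscore' is sketched here")
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    std[std == 0] = 1.0  # guard against zero-variance columns
    return (X_train - mean) / std, (X_test - mean) / std, (mean, std)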
def test_prep_missing_val(self):
    X_train = np.array([[np.nan, 1], [2, 3], [np.nan, np.nan]])
    X_test = np.array([[np.nan, 1], [2, 3]])
    y_train = np.array([1, 2, 3])
    y_test = np.array([4, 5])
    pre_X_train, pre_X_test, pre_y_train, pre_y_test = Preprocessing.prep_missing_val(
        X_train, X_test, y_train, y_test, mode='remove')
    print()
    print('test_prep_missing_val ==========================')
    print('X_train => ', X_train, 'y_train => ', y_train)
    print('pre_X_train => ', pre_X_train, 'pre_X_test => ', pre_X_test)
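# Correspondingly, a minimal sketch of the prep_missing_val interface this
# test exercises: in 'remove' mode, rows containing NaN are dropped from X
# together with their labels. This is assumed behaviour inferred from the
# test; the real method may also support imputation modes:
def prep_missing_val_sketch(X_train, X_test, y_train, y_test, mode='remove'):
    if mode != 'remove':
        raise ValueError("only 'remove' is sketched here")
    keep_train = ~np.isnan(X_train).any(axis=1)
    keep_test = ~np.isnan(X_test).any(axis=1)
    return (X_train[keep_train], X_test[keep_test],
            y_train[keep_train], y_test[keep_test])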
def normalizeData(inputDataClass):
    ######################################## Normalising Data ####################################
    # Fit the normaliser on the training features (all columns except the
    # label in the last column), then apply the same scaling to the test set.
    normalizer = Preprocessing.Normalise()
    inputDataClass.Train = np.hstack(
        (normalizer.scale(inputDataClass.Train[:, :-1], train=True),
         inputDataClass.Train[:, -1].reshape(-1, 1)))
    inputDataClass.Test = np.hstack(
        (normalizer.scale(inputDataClass.Test[:, :-1], train=False),
         inputDataClass.Test[:, -1].reshape(-1, 1)))
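# normalizeData relies on a Preprocessing.Normalise object whose
# scale(X, train=...) fits statistics when train=True and reuses them when
# train=False. A minimal min-max sketch of that interface; the actual scaling
# rule used by the project is an assumption here:
class NormaliseSketch:
    def scale(self, X, train):
        if train:
            self.min = X.min(axis=0)
            self.range = X.max(axis=0) - self.min
            self.range[self.range == 0] = 1.0  # avoid division by zero
        return (X - self.min) / self.range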
import matplotlib.pyplot as plt


def var_vs_comp(X, start, stop, step):
    # Plot retained variance vs. number of PCA components.
    print("Making variance vs. components plot...")
    components = []
    variances = []
    i_cols = np.arange(start, stop, step)
    for k in i_cols:
        pca = Preprocessing.PCA(X, k=k, whiten=False)
        components.append(k)
        variances.append(pca.var_retained)
    plt.plot(components, variances)
    plt.ylabel('variance retained')
    plt.xlabel('number of components')
    plt.show()
def performPCA(inputDataClass, reduced_columns):
    ############################################## PCA Visualisation #############################################
    # variance vs. n_components : Fashion MNIST
    # start = 10
    # stop = 500
    # step = 15
    # Visualization.var_vs_comp(inputDataClass.Train[:, :-1], start, stop, step)

    ########################################################### PCA #############################################
    ##### Our PCA #####
    pca = Preprocessing.PCA(inputDataClass.Train[:, :-1],
                            k=reduced_columns,  ##### Hyperparameter #####
                            whiten=False)

    reduced_train = pca.reduce(inputDataClass.Train[:, :-1], True)
    inputDataClass.Train = np.hstack(
        (reduced_train, inputDataClass.Train[:, -1].reshape(-1, 1)))
    print("train_data reduced.")
    print("Train data reduced to columns = " + str(reduced_train.shape[1]))

    reduced_test = pca.reduce(inputDataClass.Test[:, :-1], False)
    inputDataClass.Test = np.hstack(
        (reduced_test, inputDataClass.Test[:, -1].reshape(-1, 1)))
    print("test_data reduced.")
    print("Test data reduced to columns = " + str(reduced_test.shape[1]))
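# Both var_vs_comp and performPCA assume a Preprocessing.PCA(X, k, whiten)
# class exposing var_retained and reduce(X, is_train). A minimal
# eigendecomposition sketch of that interface (whitening omitted; the real
# class may differ in details):
class PCASketch:
    def __init__(self, X, k, whiten=False):
        self.mean = X.mean(axis=0)
        Xc = X - self.mean
        cov = np.cov(Xc, rowvar=False)
        eigvals, eigvecs = np.linalg.eigh(cov)  # eigh returns ascending order
        order = np.argsort(eigvals)[::-1]       # re-sort descending
        eigvals, eigvecs = eigvals[order], eigvecs[:, order]
        self.components = eigvecs[:, :k]
        # Fraction of total variance captured by the top-k components.
        self.var_retained = eigvals[:k].sum() / eigvals.sum()

    def reduce(self, X, is_train):
        # is_train is accepted for interface compatibility; the projection
        # always uses the components fitted in __init__.
        return (X - self.mean) @ self.components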
# All components of DataGenerator are:
# load file, preprocessing, augmentation, batch creation, encoder output
inputs_file_loader = dict(format=img_format)
file_loader = FileLoader(**inputs_file_loader)

# Generator of inputs for the model
inputs_model = InputsModel(model_name=model_name)
batch_creator = inputs_model.create_batch

# Define preprocessing
# preprocessing inputs
target_size = 250
preprocessing = Preprocessing([
    ("rescale", dict(target_size=target_size)),
])

# Define augmentation
augmentation = None  # Augmentation()

encoder_output = EncoderOutput(
    order_output_model=order_output_model,
    # dict to map labels in 'data/labels.json' to other classes
    encode_labels=encode_labels,
)

# Shared configuration for both train and test data generators
config_generator = dict(labels=labels,
                        file_loader=file_loader,
                        batch_creator=batch_creator,
                        # remaining DataGenerator components defined above
                        preprocessing=preprocessing,
                        augmentation=augmentation,
                        encoder_output=encoder_output)
def prepare_data(num_words, seq_len):
    # Preprocessing pipeline
    pr = Preprocessing(num_words, seq_len)
    pr.load_data()
    pr.clean_text()
    pr.text_tokenization()
    pr.build_vocabulary()
    pr.word_to_idx()
    pr.padding_sentences()
    pr.split_data()

    return {'x_train': pr.x_train, 'y_train': pr.y_train,
            'x_test': pr.x_test, 'y_test': pr.y_test}
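# Example call; the vocabulary size and sequence length below are placeholder
# values to tune for the actual corpus:
data = prepare_data(num_words=1000, seq_len=35)
print(len(data['x_train']), len(data['x_test']))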
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader


class Execute:
    '''
    Class for execution. Initializes the preprocessing as well as the
    Tweet Classifier model
    '''

    def __init__(self, args):
        self.__init_data__(args)
        self.args = args
        self.batch_size = args.batch_size
        self.model = TweetClassifier(args)

    def __init_data__(self, args):
        '''
        Initialize preprocessing from raw dataset to dataset split into
        training and testing. Training and test datasets are index strings
        that refer to tokens.
        '''
        self.preprocessing = Preprocessing(args)
        self.preprocessing.load_data()
        self.preprocessing.prepare_tokens()

        raw_x_train = self.preprocessing.x_train
        raw_x_test = self.preprocessing.x_test
        self.y_train = self.preprocessing.y_train
        self.y_test = self.preprocessing.y_test

        self.x_train = self.preprocessing.sequence_to_token(raw_x_train)
        self.x_test = self.preprocessing.sequence_to_token(raw_x_test)

    def train(self):
        training_set = DatasetMaper(self.x_train, self.y_train)
        test_set = DatasetMaper(self.x_test, self.y_test)

        self.loader_training = DataLoader(training_set, batch_size=self.batch_size)
        self.loader_test = DataLoader(test_set)

        # Use the hyperparameters from the parsed arguments stored on self.
        optimizer = optim.RMSprop(self.model.parameters(), lr=self.args.learning_rate)

        for epoch in range(self.args.epochs):
            predictions = []
            self.model.train()

            for x_batch, y_batch in self.loader_training:
                x = x_batch.type(torch.LongTensor)
                y = y_batch.type(torch.FloatTensor)

                y_pred = self.model(x)
                loss = F.binary_cross_entropy(y_pred, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                predictions += list(y_pred.squeeze().detach().numpy())

            test_predictions = self.evaluation()

            train_accuracy = self.calculate_accuracy(self.y_train, predictions)
            test_accuracy = self.calculate_accuracy(self.y_test, test_predictions)

            print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f"
                  % (epoch + 1, loss.item(), train_accuracy, test_accuracy))

    def evaluation(self):
        predictions = []
        self.model.eval()
        with torch.no_grad():
            for x_batch, y_batch in self.loader_test:
                x = x_batch.type(torch.LongTensor)
                y_pred = self.model(x)
                predictions += list(y_pred.detach().numpy())
        return predictions

    @staticmethod
    def calculate_accuracy(ground_truth, predictions):
        # Count correct predictions with a 0.5 decision threshold.
        true_positives = 0
        true_negatives = 0
        for true, pred in zip(ground_truth, predictions):
            if (pred > 0.5) and (true == 1):
                true_positives += 1
            elif (pred < 0.5) and (true == 0):
                true_negatives += 1
        return (true_positives + true_negatives) / len(ground_truth)
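# A minimal driver sketch for Execute. The argument names mirror the ones the
# class actually reads (batch_size, learning_rate, epochs); the default values
# are placeholders, and any further hyperparameters that Preprocessing or
# TweetClassifier expect would need to be added here as well:
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=5)
    args = parser.parse_args()

    execute = Execute(args)
    execute.train()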