import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import load


def pre_process_to_pickle(version='large'):
    """Multi-layer Perceptron is sensitive to feature scaling; this function
    uses MinMaxScaler to scale input features between (-1, 1).

    Returns:
        scaled: train_X, val_X
    """
    train_X, val_X, train_y, val_y = load.read_data(bert=version)

    # convert dates to integer timestamps and expand the list-valued
    # review column into one float32 column per element
    train_X.date = pd.to_datetime(train_X.date).astype('int64')
    train_X = pd.concat([
        train_X.drop('review', axis=1),
        pd.DataFrame(train_X.review.tolist(), dtype=np.float32)
    ], axis=1)

    val_X.date = pd.to_datetime(val_X.date).astype('int64')
    val_X = pd.concat([
        val_X.drop('review', axis=1),
        pd.DataFrame(val_X.review.tolist(), dtype=np.float32)
    ], axis=1)

    # fit the scaler on the training split only, then apply it to both
    # splits, so no validation statistics leak into preprocessing
    cols = ['ex_id', 'user_id', 'prod_id', 'rating', 'date']
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_X[cols])
    train_X[cols] = scaler.transform(train_X[cols])
    val_X[cols] = scaler.transform(val_X[cols])

    # cache the scaled frames so later runs can skip this preprocessing
    path = load.get_data_path()
    with open(f'{path}train_{version}.pickle', 'wb') as file:
        pickle.dump(train_X, file)
    with open(f'{path}dev_{version}.pickle', 'wb') as file:
        pickle.dump(val_X, file)
    return train_X, val_X
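
# The cached pickles above can be read back without re-running preprocessing.
# A minimal sketch; the helper name `load_from_pickle` is hypothetical, not
# part of the original code, but it assumes the same path/naming scheme:
def load_from_pickle(version='large'):
    path = load.get_data_path()
    with open(f'{path}train_{version}.pickle', 'rb') as file:
        train_X = pickle.load(file)
    with open(f'{path}dev_{version}.pickle', 'rb') as file:
        val_X = pickle.load(file)
    return train_X, val_X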
def main():
    '''smoke-tests every transformer/model pairing on a 20-row sample of
    the datasets loaded via load.py, printing validation accuracy for each
    '''
    print("reading data")
    train_X, val_X, train_y, val_y = load.read_data()

    # downsample for testing
    print("downsampling to head(20)")
    tx = train_X.head(20).copy()
    vx = val_X.head(20).copy()
    ty = train_y.head(20).copy().values.ravel()
    vy = val_y.head(20).copy().values.ravel()

    transformers = ['cv', 'tfidf', 'w2v', 'bert']
    models = ['svm', 'nb', 'nn', 'lr']
    save_plots = True

    # look up each factory by name on the project-local kt (transformers)
    # and km (models) modules; getattr is safer than eval for this
    for t in transformers:
        t_ = getattr(kt, t)(tx.review)
        tx_ = t_.transform(tx.review.copy())
        vx_ = t_.transform(vx.review.copy())
        for m in models:
            m_ = getattr(km, m)().fit(tx_, ty)
            print(f'mean accuracy {t}|{m}: {m_.score(vx_, vy)}')
            if save_plots:
                km.metrics(m_, vx_, vy, f'{t}-{m}')
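
# A minimal sketch of what one kt factory could look like, assuming kt.cv
# wraps sklearn's CountVectorizer; the real kt module is not shown in this
# excerpt, so this is illustrative only:
from sklearn.feature_extraction.text import CountVectorizer

def cv(corpus):
    # fit a bag-of-words vectorizer on the training reviews; callers then
    # use .transform() on train and validation text, as in main() above
    return CountVectorizer().fit(corpus)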
def main(debug=False):
    '''loads datasets into dataframes using load.py and prints head(2) for each
    '''
    if debug:
        print("reading data")
    train_X, val_X, train_y, val_y = load.read_data(debug)
    if debug:
        print("data read")
    print(
        f'train_X: cols={train_X.columns.values}\n dtype={train_X.dtypes}\n shape={train_X.shape}\n'
    )
    print(train_X.head(2))
    print(
        f'val_X: cols={val_X.columns.values}\n dtype={val_X.dtypes}\n shape={val_X.shape}\n'
    )
    print(val_X.head(2))
    print(
        f'train_y: cols={train_y.columns.values}\n dtype={train_y.dtypes}\n shape={train_y.shape}\n'
    )
    print(train_y.head(2))
    print(
        f'val_y: cols={val_y.columns.values}\n dtype={val_y.dtypes}\n shape={val_y.shape}\n'
    )
    print(val_y.head(2))
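
# A minimal sketch of a command-line entry point for the inspection above;
# the --debug flag is an assumption, since the original file's entry point
# is not shown in this excerpt:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='print dataset summaries')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    main(debug=args.debug)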
def main():
    train_X, val_X, train_y, val_y = load.read_data(bert='large', debug=True)

    # Grid Search Params
    param_space = {
        'hidden_layer_sizes': [(255, 100, 50), (50, 50, 50), (255, ),
                               (100, ), (50, )],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['sgd', 'adam', 'lbfgs'],
        'alpha': 10.0**-np.arange(1, 4),
        'learning_rate': ['constant', 'adaptive', 'invscaling'],
        'max_iter': [100, 200, 300],
    }

    # Test Params -- soon to be Best Params, after I Grid Search
    # params = {
    #     'hidden_layer_sizes': (255,),
    #     'activation': 'relu',
    #     'solver': 'adam',
    #     'alpha': 1e-05,
    #     'learning_rate': 'adaptive',
    #     'max_iter': 300,
    # }

    # Grid search
    NN = searchNN(train_X, train_y, param_space)
    # Or single run
    # NN = MLPClassifier(**params).fit(train_X, train_y)

    path = load.get_data_path()
    with open(path + 'pickle_clf.pickle', 'wb') as file:
        pickle.dump(NN, file)

    # check train data
    y_truth, y_pred, y_prob = train_y, NN.predict(train_X), NN.predict_proba(
        train_X)
    score = NN.score(train_X, train_y)
    print(f"Train Score: {100 * score:.2f}")
    try:
        plots_probs = save_ROC_plot(y_prob, y_truth, score, 'train')
        print(f"Train ROC AUC: {plots_probs}")
    except Exception as err:
        print(f"skipping train ROC plot: {err}")

    # check test data
    y_truth, y_pred, y_prob = val_y, NN.predict(val_X), NN.predict_proba(val_X)
    score = NN.score(val_X, val_y)
    print(f"Test Score: {100 * score:.2f}")
    try:
        plots_probs = save_ROC_plot(y_prob, y_truth, score, 'test')
        print(f"Test ROC AUC: {plots_probs}")
    except Exception as err:
        print(f"skipping test ROC plot: {err}")

    # best_estimator_ only exists after a grid search, not after a single run
    if hasattr(NN, 'best_estimator_'):
        with open(path + 'pickle_nn.pickle', 'wb') as file:
            pickle.dump(NN.best_estimator_, file)
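
# A minimal sketch of what searchNN could look like, assuming it wraps
# sklearn's GridSearchCV around an MLPClassifier; the original definition
# is not shown in this excerpt:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

def searchNN(X, y, param_space, cv=3, n_jobs=-1):
    # exhaustively search param_space with cross-validation; the returned
    # object exposes predict/score like an estimator, plus best_estimator_
    search = GridSearchCV(MLPClassifier(), param_space, cv=cv, n_jobs=n_jobs)
    search.fit(X, y)
    print(f'best params: {search.best_params_}')
    return search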
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 3 01:51:10 2021

@author: Otto
"""
import numpy as np
import scipy
import h5py
import hdf5storage
from numpy import pi, sqrt

from load import read_data
from functions import if2gputorre3d, frequency2timedomain, qam_demod

data = read_data()

f_min = data.variables.f_min
f_max = data.variables.f_max
pulse_length = data.variables.pulse_length
carrier_freq = data.variables.carrier_freq
spatial_scaling_constant = data.variables.spatial_scaling_constant
temporal_scaling_constant = data.variables.temporal_scaling_constant
mu_0 = data.variables.mu_0
eps_0 = data.variables.eps_0
c_0 = data.variables.c_0
or_radius = data.variables.or_radius
or_buffer = data.variables.or_buffer
s_radius = data.variables.s_radius
t_shift = data.variables.t_shift
reference_amplitude = data.variables.reference_amplitude
time_series_count = data.variables.time_series_count
t_1 = data.variables.t_1
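
# The block above repeats `data.variables.` sixteen times; a compact
# alternative (a sketch, not the original author's code) collects the same
# constants into a dict with getattr:
CONSTANT_NAMES = [
    'f_min', 'f_max', 'pulse_length', 'carrier_freq',
    'spatial_scaling_constant', 'temporal_scaling_constant',
    'mu_0', 'eps_0', 'c_0', 'or_radius', 'or_buffer', 's_radius',
    't_shift', 'reference_amplitude', 'time_series_count', 't_1',
]
constants = {name: getattr(data.variables, name) for name in CONSTANT_NAMES}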
def load(self):
    self.print_debug("loading data from file")
    self.train_X, self.val_X, self.train_y, self.val_y = load.read_data(
        self.debug)
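
# Usage sketch; the class name `Pipeline` is hypothetical, since the
# enclosing class is not shown in this excerpt:
# pipe = Pipeline(debug=True)
# pipe.load()
# print(pipe.train_X.shape, pipe.val_X.shape)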