def estimate_phase_dev(cell, temperature):
    """
    Final estimate for the circadian phase deviation.

    Parameters
    ----------
    cell : string
        Cell condition ('NIH3T3' or 'U2OS').
    temperature : int
        Temperature condition (currently ignored, see note below).

    Returns
    -------
    tuple
        The standard deviation for the phase progression, and the periods.

    Raises
    ------
    ValueError
        If the cell type is unknown.
    """
    # CORRECTION: not enough traces at 34°C and 40°C, so the temperature
    # condition is forced to None (all temperatures pooled together).
    print('CAUTION : Parameters for None temperature selected since not enough \
traces at 34°C and 40°C')
    temperature = None

    # Load the traces for the requested cell line.
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=temperature,
                             division=False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = \
            dataClass.load(load_annotation=True)
        std, std_T = estimate_phase_dev_from_signal(ll_peak)
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
        dataClass = LoadData(path, 10000000, temperature=temperature,
                             division=True)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = \
            dataClass.load(load_annotation=True)
        std, std_T = estimate_phase_dev_from_signal(ll_peak)
        # Correction for the neglected coupling since these are dividing traces.
        std = std * 0.65
        std_T = std_T * 0.65
    else:
        # Previously this branch only printed a message and then crashed with
        # an UnboundLocalError on `std`; fail fast with a clear error instead.
        raise ValueError("Cell type doesn't exist: %r" % cell)
    return std, std_T
def train():
    """Train the VGG16 model with LR decay and early stopping.

    Halves the learning rate once the validation loss has not improved for
    5 consecutive epochs, stops entirely after 10, and saves the best model
    (lowest validation loss) to 'VGG.pkl'.
    """
    batch_size = 10
    epochs = 50
    bestloss = 1e10  # best validation loss seen so far
    learning_rate = 5e-4
    Trainer = VGG16Trainer().cuda()
    path = './train'
    trainLabel = getLabel(path)
    traindata = LoadData(path, Label=trainLabel)
    dataloader = DataLoader(traindata, batch_size, shuffle=True)
    valLabel = getLabel('./val')
    valdata = LoadData('./val', Label=valLabel)
    valdataloader = DataLoader(valdata, batch_size, shuffle=True)
    count = 0  # number of epochs since the last validation improvement
    for epoch in range(epochs):
        # Decay the learning rate once the plateau reaches 5 epochs...
        if count == 5:
            learning_rate *= 0.5
            for param_group in Trainer.optimizer.param_groups:
                param_group['lr'] = learning_rate
        # ...and stop training entirely at 10.
        if count == 10:
            break
        Trainer.train()
        totalloss = 0
        for i_batch, batch_data in enumerate(dataloader):
            image = batch_data['image']
            label = batch_data['label'].cuda()
            # Scale raw pixel values into [0, 1] before the forward pass.
            image = image.cuda().float() / 255.
            loss = Trainer.train_step(image, label)
            totalloss += loss
        print('train loss:')
        print(totalloss / len(dataloader))
        Trainer.eval()
        valloss = 0
        with torch.no_grad():
            for i_batch, batch_data in enumerate(valdataloader):
                image = batch_data['image']
                label = batch_data['label'].cuda()
                image = image.cuda().float() / 255.
                valloss += Trainer.forward(image, label)
        print('val loss:')
        valloss_a = valloss / len(valdataloader)
        print(valloss_a)
        # Checkpoint on improvement; otherwise grow the plateau counter.
        if valloss_a < bestloss:
            bestloss = valloss_a
            print('saved')
            Trainer.save('VGG.pkl')
            count = 0
        else:
            count += 1
def estimate_OU_par(cell, temperature, W=None, gamma_A=0.03, gamma_B=0.03):
    """
    Estimate mean and variance of OU processes given a set of conditions,
    according to which a set of traces is filtered.

    Parameters
    ----------
    cell : string
        Cell type ('NIH3T3' or 'U2OS').
    temperature : integer
        Temperature condition (currently ignored, see note below).
    W : list
        Waveform.
    gamma_A : float
        Regression parameter for the amplitude.
    gamma_B : float
        Regression parameter for the background.

    Returns
    -------
    The mean and standard deviations of the amplitude and the background.

    Raises
    ------
    ValueError
        If the cell type is unknown.
    """
    # CORRECTION: not enough traces at 34°C and 40°C, so the temperature
    # condition is forced to None (all temperatures pooled together).
    print(
        'CAUTION : Parameters for None temperature selected since not enough \
traces at 34°C and 40°C')
    temperature = None

    # Load the traces for the requested cell line.
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=temperature,
                             division=False)
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
        dataClass = LoadData(path, 10000000, temperature=temperature,
                             division=True)
    else:
        # Previously fell through with `dataClass` undefined (NameError).
        raise ValueError("Cell type doesn't exist: %r" % cell)
    try:
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = \
            dataClass.load(load_annotation=True)
    except OSError:
        # The data folder may sit one directory up depending on the caller's
        # working directory; retry from the parent. (Was a bare `except:`,
        # which also swallowed KeyboardInterrupt and genuine bugs.)
        dataClass.path = '../' + dataClass.path
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = \
            dataClass.load(load_annotation=True)
    return estimate_OU_par_from_signal(ll_signal, W, gamma_A, gamma_B)
def estimate_cycle_dev(cell, temperature):
    """
    Final estimate for the cell-cycle phase deviation.

    Parameters
    ----------
    cell : string
        Cell condition ('NIH3T3' or 'U2OS').
    temperature : int
        Temperature condition.

    Returns
    -------
    tuple
        The standard deviation for the phase progression, and the periods.

    Raises
    ------
    ValueError
        If the cell type is unknown.
    """
    # Select the trace file for the requested cell line.
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
    else:
        # Previously fell through with `path` undefined (NameError).
        raise ValueError("Cell type doesn't exist: %r" % cell)
    # Cell-cycle deviation requires dividing traces.
    dataClass = LoadData(path, 10000000, temperature=temperature,
                         division=True)
    (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
     ll_idx_cell_cycle_start, T_theta, T_phi) = \
        dataClass.load(load_annotation=True)
    # Deviation estimated from the recorded division (cycle-start) indices.
    std, std_T = estimate_phase_dev_from_div(ll_idx_cell_cycle_start)
    return std, std_T
def MeanAndVarMapper(fileName):
    """Map step: return (mean, variance, sample count) for one data file."""
    samples = LoadData(fileName)[0]
    sample_matrix = numpy.mat(samples)
    return numpy.mean(sample_matrix), numpy.var(samples), len(samples)
def predict(self): """ Retruns the forecasts of the run function """ #import pdb; pdb.set_trace() #classifier = self.optimization()[1] self.optimization() classifier = pickle.load(open('best_model3.pkl', 'rb')) predict_model = theano.function( inputs=[classifier.input], outputs=classifier.LinearRegression.y_pred) # We can test it on some examples from test test data = LoadData(self.link) datasets = data.load_data() #import pdb; pdb.set_trace() x_test, y_test = datasets[2] predicted_values = predict_model(x_test.get_value()) fig = figure() _ = plt.scatter(x_test.get_value(), predicted_values, c='red', label='Predicted Values') _ = plt.scatter(x_test.get_value(), y_test.get_value(), facecolors='none', edgecolors='r', label='Sample Points') _ = plt.legend() #plt.show() return fig
def train_n_classes(self):
    """Benchmark a multi-class (5-way) Multinomial Naive Bayes classifier
    over several per-class training sizes, printing metrics for each."""
    loader = LoadData()
    stop_words = loader.loadStopWords()
    per_class_sizes = [100, 200, 300]  # size per class
    for size in per_class_sizes:
        print('Training size:', math.floor(size * 0.75) * 5,
              'Test size:', math.ceil(size * 0.25) * 5)
        self.loadData(size)
        vectorizer = TfidfVectorizer(stop_words=stop_words)
        self.train_and_test_split(0.75)
        # Map each class name to a dense integer id in first-seen order.
        class_ids = {}
        for name in self.data['class_name']:
            class_ids.setdefault(name, len(class_ids))
        X_train = vectorizer.fit_transform(self.train_data['data'])
        Y_train = [class_ids[name] for name in self.train_data['class_name']]
        X_test = vectorizer.transform(self.test_data['data'])
        Y_test = [class_ids[name] for name in self.test_data['class_name']]
        model = MultinomialNB().fit(X_train, Y_train)
        self.metric(Y_test, model.predict(X_test))
        print('---------------------------------------------------')
def main():
    """
    Main program of the project: load the Yelp data, ask the user for a
    choice, plot the results, and emit an HTML report.

    Catches common fatal errors (ValueError, KeyboardInterrupt, MemoryError)
    and exits cleanly.
    """
    # Fix: this block used Python-2 `print` statements while the rest of the
    # file uses the Python-3 print() function, making the file unimportable
    # under Python 3.
    try:
        yelp = LoadData()
        user = UserChoice()
        choice = user.get_user_input()
        plots = PlotVisualization(yelp.get_data())
        h = Html()  # Output result to html
        if choice == 'quit':
            print("Quitting...")
        elif choice == "overview":
            plots.plot_overview()
            print("Overview only.")
            h.output_to_file(False)
        else:
            plots.plot_search_results(choice)
            print('Your choice of restaurants received.')
            h.output_to_file(True)
    except ValueError:
        print("Found value error.")
        sys.exit()
    except KeyboardInterrupt:
        print("Interrupted!")
        sys.exit()
    except MemoryError:
        print("Memory Error")
        sys.exit()
def main():
    """Entry point: load the dataset, build and train the CSACCM model,
    then reload the best checkpoint and evaluate on the test split."""
    # Data loading
    arg_parser = CSACCMArgs()
    args = arg_parser.args
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Seed numpy and tensorflow for reproducibility.
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    data = LoadData(args.path, args.dataset, label=args.label, sep=args.sep,
                    append_id=True, include_id=False)
    if args.verbose > 0:
        print(args)

    # Training
    t1 = time()
    model = CSACCM(feature_num=len(data.features),
                   feature_dims=data.feature_dims,
                   f_vector_size=args.f_vector_size,
                   user_feature_num=len(data.user_features),
                   item_feature_num=len(data.item_features),
                   user_num=data.user_num, item_num=data.item_num,
                   ui_vector_size=args.ui_vector_size,
                   warm_ratio=args.warm_ratio,
                   # NOTE(review): eval() on a CLI argument — assumes the
                   # command line is trusted input.
                   cb_hidden_layers=eval(args.cb_hidden_layers),
                   attention_size=args.attention_size,
                   optimizer=args.optimizer, learning_rate=args.lr,
                   batch_size=args.batch_size, epoch=args.epoch,
                   dropout_keep=args.dropout_keep, l2_regularize=args.l2,
                   verbose=args.verbose, random_seed=args.random_seed,
                   model_path=args.model)
    # Optionally resume from a previously saved model.
    if args.load == 1:
        model.load_model()
    # train
    model.train(data.train_data, data.validation_data, data.test_data,
                args.load == 1)
    model.print_result(t1)

    # test: reload the saved (best) model and predict on the test split.
    model.load_model()
    model.predict(data.test_data)
def invoke_data_summary(self):
    """Load the pickled train/test/validation sets, print a basic summary,
    and produce exploratory visualisations of the label distributions."""
    # Load data from the pickle files.
    loader = LoadData(self.my_variables.training_file,
                      self.my_variables.testing_file,
                      self.my_variables.validation_file)
    datasets = loader.get_data()
    self.x_train, self.y_train = datasets[0], datasets[1]
    self.x_test, self.y_test = datasets[2], datasets[3]
    self.x_valid, self.y_valid = datasets[4], datasets[5]
    # Basic summary of the dataset.
    self.bs.summary_report(self.x_train, self.y_train, self.x_test,
                           self.y_test, self.x_valid, self.y_valid)
    # Exploratory label-distribution bar charts for each split.
    self.vz.bar_chart(self.y_train, "train_data")
    self.vz.bar_chart(self.y_test, "test_data")
    self.vz.bar_chart(self.y_valid, "validation_data")
    # Sign names and a few random training images.
    self.vz.read_sign_names_from_csv(self.my_variables)
    self.vz.display_random_images(self.x_train, self.y_train,
                                  self.my_variables, "train")
def __init__(self, tickers, start='2014-01-01', end='2018-01-01',
             interval='1d', n_series=20, T_pred=10, n_cols=30, n_rows=30,
             T_space=10, train=True):
    """
    Dataset wrapper: load cached numpy arrays if present, otherwise
    download and process the raw ticker data via LoadData.

    Parameters
    ----------
    tickers : sequence of str
        Ticker symbols; concatenated into the cache folder name.
    start, end : str
        ISO date range of the series.
    interval : str
        Sampling interval (e.g. '1d').
    n_series, T_pred, n_cols, n_rows, T_space : int
        Processing-window parameters; part of the cache folder name.
    train : bool
        Load the training split if True, else the test split.
    """
    # The base path is reused below for the raw CSV cache (was duplicated).
    base = './' + ''.join(tickers) + '_start' + start + '_end' + end + \
        '_int' + interval
    self.folder = base + '/case' + str(n_series) + '_' + str(T_pred) + '_' + \
        str(n_cols) + '_' + str(n_rows) + '_' + str(T_space)
    try:
        self.original = np.load(self.folder + '/original.npy')
        if train:
            self.x = np.load(self.folder + '/Xtrain.npy')
            self.y = np.load(self.folder + '/Ytrain.npy')
        else:
            self.x = np.load(self.folder + '/Xtest.npy')
            self.y = np.load(self.folder + '/Ytest.npy')
    except OSError:
        # Cache miss (was a bare `except:`, which also hid real bugs):
        # rebuild everything from the raw data.
        ld = LoadData(tickers, start, end, interval)
        try:
            ld.unprocessed = pd.read_csv(base + '/UnprocessedData.csv')
        except OSError:
            print('DOWNLOADING DATA')
            ld.download()
        print('PROCESSING DATA')
        ld.process(n_series, T_pred, n_cols, n_rows, T_space, plot=True)
        ld.cut_and_shuffle()
        if train:
            self.x = ld.Xtrain
            self.y = ld.Ytrain
        else:
            self.x = ld.Xtest
            self.y = ld.Ytest
        self.original = ld.original
    # Shape of X: (n_samples, n_tickers, n_rows, n_cols)
    # Shape of Y: (n_samples, n_tickers)
    self.len = self.x.shape[0]
def main():
    """Entry point: load the dataset, build and train the base model,
    then reload the best checkpoint and evaluate on the test split."""
    # Data loading
    arg_parser = BaseArgs()
    args = arg_parser.args
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Seed numpy and tensorflow for reproducibility.
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    data = LoadData(args.path, args.dataset, label=args.label,
                    append_id=False, include_id=True)
    if args.verbose > 0:
        print(args)

    # Training
    t1 = time()
    model = BaseModel(feature_num=len(data.features),
                      optimizer=args.optimizer, learning_rate=args.lr,
                      batch_size=args.batch_size, epoch=args.epoch,
                      dropout_keep=args.dropout_keep, l2_regularize=args.l2,
                      verbose=args.verbose, random_seed=args.random_seed,
                      model_path=args.model)
    # Optionally resume from a previously saved model.
    if args.load == 1:
        model.load_model()
    # train
    model.train(data.train_data, data.validation_data, data.test_data,
                args.load == 1)
    model.print_result(t1)

    # test: reload the saved (best) model and predict on the test split.
    model.load_model()
    model.predict(data.test_data)
def feature_extraction_NFM(X, y_binary, i_Pos_sample_set, i_Neg_sample_set,
                           args):
    """
    Train a Neural Factorization Machine on the labeled subset and return
    the deep features it extracts for the WHOLE dataset.

    Parameters
    ----------
    X : sequence
        Full feature dataset.
    y_binary : sequence
        Binary labels aligned with X.
    i_Pos_sample_set, i_Neg_sample_set : set of int
        Indices of the labeled positive / negative samples.
    args : namespace
        NFM hyper-parameters (loss_type, hidden_factor, layers, ...).

    Returns
    -------
    The deep features produced by the trained model for every sample in X.
    """
    # The labeled data comes from both pos and neg. Compute the union ONCE
    # (previously built twice) so both lists iterate the exact same set in
    # the exact same order.
    labeled_idx = set.union(i_Pos_sample_set, i_Neg_sample_set)
    X_label = [X[i] for i in labeled_idx]
    y_label = [y_binary[i] for i in labeled_idx]
    X_train = np.asarray(X_label)
    Y_train = np.asarray(y_label)
    # Validation reuses the training subset (no held-out split here).
    X_validation = np.asarray(X_label)
    Y_validation = np.asarray(y_label)
    # Use the whole dataset as "test" so features are produced for it all.
    X_test = copy.deepcopy(X)
    Y_test = copy.deepcopy(y_binary)
    data = LoadData(args.loss_type, X_train, Y_train, X_validation,
                    Y_validation, X_test, Y_test)

    # Training
    model = NeuralFM(data.features_M, args.hidden_factor, args.layers,
                     args.loss_type, args.pretrain, args.epoch,
                     args.batch_size, args.lr, args.lamda, args.keep_prob,
                     args.optimizer, args.batch_norm,
                     args.activation_function, args.verbose, args.early_stop)
    model.train(data.Train_data, data.Validation_data, data.Test_data)
    features = model.get_deep_feature(data.Test_data)
    return features
def show_no_temperature_difference():
    """
    Plot the estimate of the phase diffusion variance (with confidence
    bars) at the three recorded temperatures.
    """
    temperatures = [34, 37, 40]
    variance_means = []
    variance_devs = []
    for temperature in temperatures:
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        data_loader = LoadData(path, 10000000, temperature=temperature,
                               division=False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = \
            data_loader.load(load_annotation=True)
        var, var_var = compute_phase_variance_with_confidence(ll_peak)
        variance_means.append(var)
        variance_devs.append(var_var)
    plt.errorbar(temperatures, variance_means, yerr=variance_devs, fmt='o')
    plt.xlim([33, 41])
    plt.xlabel("Temperature")
    plt.ylabel("Phase diffusion variance mean and deviation")
    plt.savefig('Results/RawData/var_diffusion.pdf')
    plt.show()
    plt.close()
def compute_likelihood_sigma():
    """
    Compute and plot the likelihood of the phase diffusion parameter
    (sigma_theta), one curve per temperature condition.
    """
    # Fix: the original reused the name `T` for three different things
    # (the temperature loop variable, each computed period, and a second
    # loop variable), silently clobbering the outer loop variable. The
    # behavior was unaffected, but the shadowing was a bug waiting to
    # happen; the inner values are renamed `period`.
    l_T = [34, 37, 40]
    l_likelihood_T = []
    mean_IG = 24  # mean of the inverse-Gaussian period model
    domain_sigma = np.linspace(0.05, 0.3, 100)
    for temperature in l_T:
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=temperature,
                             division=False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = \
            dataClass.load(load_annotation=True)
        # Collect inter-peak periods (in the traces' half-hour units).
        l_T_clock = []
        for l_peak in ll_peak:
            l_idx_peak = [idx for idx, i in enumerate(l_peak) if i == 1]
            for t_peak_1, t_peak_2 in zip(l_idx_peak[:-1], l_idx_peak[1:]):
                period = (t_peak_2 - t_peak_1) / 2
                # Remove outliers: double or missing annotation.
                if period < 12 or period > 38:
                    pass
                else:
                    l_T_clock.append(period)
        # Average log-likelihood over the periods for each candidate sigma.
        l_likelihood = []
        for sigma_theta in domain_sigma:
            lpx = np.log(1 / sigma_theta)
            for period in l_T_clock:
                lpx = lpx + np.log(invgauss(period, mean_IG,
                                            4 * np.pi**2 / sigma_theta**2))
            l_likelihood.append(lpx / len(l_T_clock))
        l_likelihood_T.append(l_likelihood)
    plt.plot(domain_sigma, l_likelihood_T[0], c='red', label='34')
    plt.plot(domain_sigma, l_likelihood_T[1], c='blue', label='37')
    plt.plot(domain_sigma, l_likelihood_T[2], c='orange', label='40')
    # Mark the maximum-likelihood sigma for each temperature.
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[0])], c='red')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[1])], c='blue')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[2])], c='orange')
    plt.ylabel(r'$log(L(\sigma_\theta))$')
    plt.xlabel(r'$\sigma_\theta$')
    plt.legend()
    plt.savefig('Results/RawData/likelihood_sigma_theta.pdf')
    plt.show()
    plt.close()
def __init__(self):
    """Load the training data and initialise the gradient-descent state."""
    self.rawdata = LoadData('.\ex1data1.txt')
    # Presumably X is the design matrix, y the targets and batch_size the
    # sample count — TODO confirm against LoadData.loadTXT.
    self.X, self.y, self.batch_size = self.rawdata.loadTXT()
    # Initial parameters (two of them: intercept and slope), start at zero.
    self.theta = np.array([[0.], [0.]])
    self.alpha = 0.01  # learning rate
    self.costlst = []  # cost history, one entry per iteration
    self.thetalst = []  # parameter history, one entry per iteration
    # Sanity-check prints on the loaded data.
    print(self.theta.shape)
    print(self.batch_size)
    print(np.sum(self.X[:, 1]))
    print(np.sum(self.y))
def train(self):
    """Train and evaluate a Multinomial Naive Bayes classifier on the
    BBC text dataset, reporting metrics on a 75/25 split."""
    loader = LoadData()
    stop_words = loader.loadStopWords()
    self.loadDataCSV('bbc-text.csv')
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    self.train_and_test_split(0.75)
    X_train = vectorizer.fit_transform(self.train_data['data'])
    Y_train = self.train_data['class_name']
    X_test = vectorizer.transform(self.test_data['data'])
    Y_test = self.test_data['class_name']
    model = MultinomialNB().fit(X_train, Y_train)
    self.metric(Y_test, model.predict(X_test))
def load_model_and_predict(data=None):
    """Load the saved TransactionClassifier and print its confusion
    matrices on the validation and test splits.

    If no data object is supplied, one is built with the default input
    features and processed from scratch.
    """
    classifier = TransactionClassifier()
    classifier.load_model()
    if data is None:
        data = LoadData(x_input_features=[5, 6, 7])
        data.load_processed_data()
    validation_split = data.load_validation_data()
    test_split = data.load_test_data()
    print(classifier.get_confusion_matrix(validation_split))
    print(classifier.get_confusion_matrix(test_split))
def __init__(self):
    """Load the training data and initialise the gradient-descent state."""
    self.rawdata = LoadData('.\ex2data1.txt')
    self.X, self.y, self.batch_size, rawdata = self.rawdata.loadTXT()
    # Three parameters: presumably intercept plus one per feature — TODO
    # confirm against the data layout.
    self.theta = np.array([[0.], [0.], [0.]])
    self.alpha = 0.01  # learning rate
    self.costlst = []  # cost history, one entry per iteration
    self.thetalst = []  # parameter history, one entry per iteration
    # Split the raw rows by label (third column) — used for plotting the
    # positive and negative classes separately.
    self.rawdata_p = rawdata[np.where(rawdata[:, 2] == 1)]
    self.rawdata_n = rawdata[np.where(rawdata[:, 2] == 0)]
    # NOTE(review): y appears to come back wrapped in an outer container;
    # only its first element is kept — confirm against loadTXT.
    self.y = self.y[0]
def plot_hist_periods(cell, temperature, division): """ Given a cell condition, compute and plot a histgram of periods. Parameters ---------- cell : string Cell condition. """ ##################### LOAD DATA ################## if cell == 'NIH3T3': path = "../Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p" else: path = "../Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p" dataClass = LoadData(path, 10000000, temperature=temperature, division=division) (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak, ll_idx_cell_cycle_start, T_theta, T_phi) \ = dataClass.load(load_annotation=True) if division: ##################### COMPUTE CELL CYCLE DISTRIBUTION ################## l_T_cell_cycle = [] for l_div_index in ll_idx_cell_cycle_start: for t1, t2 in zip(l_div_index[:-1], l_div_index[1:]): l_T_cell_cycle.append((t2 - t1) / 2) ##################### COMPUTE CIRCADIAN CLOCK DISTRIBUTION ################# l_T_clock = [] for l_peak in ll_peak: l_idx_peak = [idx for idx, i in enumerate(l_peak) if i == 1] for t_peak_1, t_peak_2 in zip(l_idx_peak[:-1], l_idx_peak[1:]): l_T_clock.append((t_peak_2 - t_peak_1) / 2) ##################### PLOT BOTH DISTRIBUTIONS ################## bins = np.linspace(8, 38, 40) if division: plt.hist(l_T_cell_cycle, bins, alpha=0.5, label='cell-cycle') plt.hist(l_T_clock, bins, alpha=0.5, label='clock') plt.legend(loc='upper right') if division: plt.savefig('../Results/RawData/Distributions_div_'+str(temperature)\ +"_"+cell+'.pdf', bbox_inches='tight') else: plt.savefig('../Results/RawData/Distributions_nodiv_'+str(temperature)\ +"_"+cell+'.pdf', bbox_inches='tight') plt.close()
def __init__(self):
    """Configure the file paths, portal URLs and e-mail notifier for the
    campus wifi client (reads conf/*, writes out/*)."""
    # Per-action request-header template files.
    # NOTE(review): the 'login' values below look like masked placeholders
    # ('******') — restore the real paths from the original configuration.
    self.headerFiles = {
        "abslogout": "conf/wifi_abslogout_header",
        "login": "******",
        "auth": "conf/wifi_auth_header",
        "logout": "conf/wifi_logout_header"
    }
    # Per-action request-parameter template files.
    self.paramFiles = {
        "abslogout": "conf/wifi_abslogout_params",
        "login": "******",
        "auth": "conf/wifi_auth_params",
        "logout": "conf/wifi_logout_params"
    }
    # Per-action response output files.
    self.outFiles = {
        "abslogout": "out/abslogout.txt",
        "login": "******",
        "auth": "out/auth.txt",
        "logout": "out/logout.txt"
    }
    ld = LoadData()
    # Portal endpoint URLs, loaded from the conf/urls file.
    urls = ld.loadUrls('conf/urls')
    self.urls = {
        "abslogout": urls['abslogout'],
        "login": urls['login'],
        "logout": urls['logout'],
        "auth": urls['auth']
    }
    # E-mail notification settings: sender, recipient and password.
    edatas = ld.loadParams('conf/email', split="=")
    self.eml = MyEmail()
    self.eml.setUser(edatas['msg_from'], edatas['msg_to'], edatas['passwd'])
    self.ld = ld
    # Ensure the output directory exists; warn if conf/ is missing
    # (message: "config files damaged, cannot run — fix via the code").
    if not os.path.exists("out"):
        os.mkdir('out')
    if not os.path.exists('conf'):
        print("配置文件损坏,无法运行,请自行查看代码修复!很容易")
def optimize(self):
    """
    Do the actual optimisation: pretrain a Deep Belief Network layer by
    layer, then fine-tune it with backpropagation.

    Returns
    -------
    The fine-tuned DBN instance.
    """
    # Fix: removed a leftover debugger breakpoint
    # (`import pdb; pdb.set_trace()`) that blocked every run, and a
    # duplicated assignment of batch_size.
    batch_size, finetune_lr = self.batch_size, self.finetune_lr
    n_epochs = self.training_epochs
    data = LoadData(self.link)
    datasets = data.load_data()
    train_set_x = datasets[0][0]
    n_train_batches = train_set_x.get_value(
        borrow=True).shape[0] // batch_size
    # numpy random generator, fixed seed for reproducibility
    numpy_rng = numpy.random.RandomState(123)
    # construct the Deep Belief Network
    dbn = DBN(numpy_rng=numpy_rng, output_layer=LinearRegression, n_ins=1,
              hidden_layers_sizes=[3, 3], n_outs=1)
    # Pretraining (contrastive divergence).
    # NOTE(review): `k` (the number of Gibbs steps) is not defined in this
    # method — it must come from an enclosing/global scope. Confirm.
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size,
                                                k=k)
    self.__pretraining(dbn.n_layers, pretraining_fns, n_train_batches)
    # Backpropagation fine-tuning.
    train_fn, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)
    models = (train_fn, validate_model, test_model)
    finetuning = Optimization(batch_size=batch_size, n_epochs=n_epochs)
    finetuning.Backpropagation(models, datasets)
    # Smoke-test the trained network on the test split.
    test = theano.function(inputs=[dbn.x], outputs=dbn.output_layer.y_pred)
    prediction = test(datasets[2][0].get_value())
    return dbn
def loadData(self):
    """Load the wine dataset, scale it, and pre-compute a 2-component PCA
    projection for visualisation.

    NOTE(review): the data is standardised first and then min-max scaled;
    the second transform rescales the standardised values into [0, 1] —
    confirm the double scaling is intended.
    """
    l = LoadData()
    data_cluster = l.loadData('wine-clustering.csv')
    self.std_scaler = StandardScaler()
    self.min_max_scaler = MinMaxScaler()
    # Scale all columns in place (the scalers are kept for reuse).
    data_cluster[data_cluster.columns] = self.std_scaler.fit_transform(
        data_cluster)
    data_cluster[data_cluster.columns] = self.min_max_scaler.fit_transform(
        data_cluster)
    # PCA: keep the first two principal components for 2-D plotting.
    self.pca_2 = PCA(2)
    self.pca_2_result = self.pca_2.fit_transform(data_cluster)
    self.data = data_cluster
def train_and_get_metrics():
    """Train the transaction classifier, report metrics on the train,
    validation and test splits, and persist the trained model."""
    print('LOADING DATA')
    # Load Data
    data = LoadData(x_input_features=[5, 6, 7])
    data.load_processed_data()
    splits = {
        'TRAIN': data.load_train_data(),
        'VALIDATION': data.load_validation_data(),
        'TEST': data.load_test_data(),
    }

    print('TRAINING CLASSIFIER')
    # Run Model
    transaction_classifier = TransactionClassifier()
    transaction_classifier.train(splits['TRAIN'])

    print('TESTING CLASSIFIER')
    results = {name: transaction_classifier.test(split)
               for name, split in splits.items()}

    print('STATS:')
    for name in ('TRAIN', 'VALIDATION', 'TEST'):
        print('------------ %s SET -------------' % name)
        print('LENGTH =', len(splits[name][0]))
        print('Metrics:\n', results[name])
    print('++++++++++++++++++++++++++++++++++++++++')

    print('SAVING CLASSIFIER')
    transaction_classifier.save_model()
def train_1_class(self):
    """Binary classification ('business' vs everything else) with a
    Multinomial Naive Bayes model, at several training sizes."""
    l = LoadData()
    stopWords = l.loadStopWords()
    train_sizes = [100, 200]  # size per class
    for train_size in train_sizes:
        print('Training size:', math.floor(train_size * 0.75) * 2,
              'Test size:', math.ceil(train_size * 0.25) * 2)
        self.loadData(train_size)
        vect = TfidfVectorizer(stop_words=stopWords)
        # Balance the two classes by resampling the non-'business' tail
        # down to train_size items.
        # NOTE(review): random.choices samples WITH replacement, so the
        # balanced class may contain duplicates — confirm that's intended.
        temp_class = self.data['class_name'][train_size:]
        temp_data = self.data['data'][train_size:]
        idx = random.choices(range(len(temp_class)), k=train_size)
        temp_class = [temp_class[i] for i in idx]
        temp_data = [temp_data[i] for i in idx]
        # Replace the tail with the resampled, balanced subset.
        del self.data['data'][train_size:]
        del self.data['class_name'][train_size:]
        self.data['class_name'].extend(temp_class)
        self.data['data'].extend(temp_data)
        self.train_and_test_split(0.75)
        X_train = vect.fit_transform(self.train_data['data'])
        # Binary labels: 1 for 'business', 0 otherwise.
        Y_train = [
            1 if i == 'business' else 0
            for i in self.train_data['class_name']
        ]
        X_test = vect.transform(self.test_data['data'])
        Y_test = [
            1 if i == 'business' else 0
            for i in self.test_data['class_name']
        ]
        nb = MultinomialNB()
        Y_pred = nb.fit(X_train, Y_train).predict(X_test)
        self.metric(Y_test, Y_pred)
        print('---------------------------------------------------')
def generate_report(self, input_file_name, output_file_name, athlete_name,
                    is_running_file, date):
    """
    Generate the tex file that will later be compiled into a pdf.

    :param input_file_name: Name of the csv file containing data
    :param output_file_name: Name of the tex file that is generated
    :param athlete_name: Name of the athlete separated by an underscore
    :param is_running_file: Boolean, either true or false
    :param date: The date associated with the file in format 'yyyy_mm_dd'
    :return: Nothing
    """
    data = LoadData().load_csv(input_file_name)
    stats = StatisticsAndPlots()
    # Averages are rendered with two decimals. Average power is passed as
    # None for running files (it is only computed for non-running files).
    self.build_tex(output_file_name, athlete_name, is_running_file,
                   ("%.2f" % stats.get_average_speed(data)),
                   ("%.2f" % stats.get_average_heart_rate(data)),
                   ("%.2f" % stats.get_average_cadence(data)),
                   (None if is_running_file else
                    "%.2f" % stats.get_average_power(data, is_running_file)),
                   stats.make_pace_histogram(data, date, athlete_name),
                   stats.make_zone_histogram(data, date, athlete_name),
                   stats.make_raw_plot(data, date, athlete_name),
                   date)
    # Track how many reports this instance has produced.
    self.number_of_files_created += 1
# NOTE(review): this is a fragment of a larger loading script — `filename`,
# `time`, `np` and `LoadData` must be defined above this excerpt (the first
# use of `filename` below reads it before this fragment assigns it), and the
# `while 1:` loop body continues below it.
filename2 = "/workspace/data/labels1.txt"
counter = 1
start = time.time()
# Swap the numeric suffix before the file extension:
# "...<counter-1>.ext" -> "...<counter>.ext".
filename = filename[:-4 - len(str(counter - 1))] + str(counter) + filename[-4:]
# Pre-allocate a large disk-backed array so the concatenated dataset never
# needs to fit in RAM at once.
data = np.memmap('newdata.array', dtype=np.float64, mode='w+',
                 shape=(1300000, 100, 40, 3))
print("[INFO] Loading first file... ")
with open(filename, "r") as file:
    data_temp = LoadData(file)
    length = data_temp.shape[0]
    # Copy the first file's rows into the head of the memmap.
    data[0:length] = data_temp
    counter = counter + 1
end = time.time()
elapsed = end - start
print("[INFO] Finished loading first file, elapsed time: " + str(elapsed))
print("[INFO] data shape: " + str(data.shape))
print("[INFO] length: " + str(length))
# Keep appending subsequent files (loop body continues past this excerpt).
while 1:
    print("[INFO] Loading file " + str(counter) + " ...")
    start = time.time()
import sys
sys.path.append("../Basic Functions")
import AdaBoost
from LoadData import LoadData

if __name__ == '__main__':
    # Fit an AdaBoost ensemble of 10 weak learners on the training file.
    trainingDataArray, trainingLabelList = LoadData("HorseColicTraining.txt")
    classifierList, totalPredictValue = AdaBoost.AdaboostTrain(
        trainingDataArray, trainingLabelList, 10)
    # Classify the held-out test file and collect misclassified indices.
    testDataArray, testLabelList = LoadData("HorseColicTest.txt")
    result = AdaBoost.AdaClassify(testDataArray, classifierList)
    errorList = [
        i for i, label in enumerate(testLabelList) if label != result[i]
    ]
    print(errorList)
    # ROC curve / AUC from the training predictions.
    AUC = AdaBoost.PlotROC(trainingLabelList, totalPredictValue)
    print(AUC)
def load_data_from_pickle_file(self):
    """Populate the maze attributes from a pickled file and, if requested,
    draw the maze."""
    loaded = LoadData(self.filename).load_data_from_pickle_file()
    (self.grid, self.entry_coords, self.exit_coords,
     self.solution_path, self.path_value, self.obstacle_value) = loaded
    if self.display_maze:
        self.show_maze()
def load_grid_from_csv(self):
    """Read the maze grid from a CSV file and, if requested, draw it."""
    loader = LoadData(self.filename)
    self.grid = loader.load_grid_from_csv()
    if self.display_maze:
        self.show_maze()