def invoke_data_summary(self):
    # Load data from pickle files
    ld = LoadData(self.my_variables.training_file,
                  self.my_variables.testing_file,
                  self.my_variables.validation_file)
    train_test_valid_data = ld.get_data()
    self.x_train, self.y_train = train_test_valid_data[0], train_test_valid_data[1]
    self.x_test, self.y_test = train_test_valid_data[2], train_test_valid_data[3]
    self.x_valid, self.y_valid = train_test_valid_data[4], train_test_valid_data[5]

    # Basic summary of the dataset
    self.bs.summary_report(self.x_train, self.y_train, self.x_test, self.y_test,
                           self.x_valid, self.y_valid)

    # Exploratory visualization of the train, test and validation label distributions
    self.vz.bar_chart(self.y_train, "train_data")
    self.vz.bar_chart(self.y_test, "test_data")
    self.vz.bar_chart(self.y_valid, "validation_data")

    self.vz.read_sign_names_from_csv(self.my_variables)
    self.vz.display_random_images(self.x_train, self.y_train, self.my_variables, "train")
def show_no_temperature_difference():
    """
    Plot the estimate of the phase diffusion variance at different temperatures.
    """
    l_var = []
    l_var_var = []
    l_temp = [34, 37, 40]
    for temperature in l_temp:
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=temperature, division=False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)
        var, var_var = compute_phase_variance_with_confidence(ll_peak)
        l_var.append(var)
        l_var_var.append(var_var)

    plt.errorbar(l_temp, l_var, yerr=l_var_var, fmt='o')
    plt.xlim([33, 41])
    plt.xlabel("Temperature")
    plt.ylabel("Phase diffusion variance (mean and deviation)")
    plt.savefig('Results/RawData/var_diffusion.pdf')
    plt.show()
    plt.close()
def train_n_classes(self):
    l = LoadData()
    stopWords = l.loadStopWords()
    train_sizes = [100, 200, 300]  # size per class
    for train_size in train_sizes:
        print('Training size:', math.floor(train_size * 0.75) * 5,
              'Test size:', math.ceil(train_size * 0.25) * 5)
        self.loadData(train_size)
        vect = TfidfVectorizer(stop_words=stopWords)
        self.train_and_test_split(0.75)

        # Map each class name to an integer label
        classes = {}
        x = 0
        for i in self.data['class_name']:
            if i not in classes:
                classes[i] = x
                x += 1

        X_train = vect.fit_transform(self.train_data['data'])
        Y_train = [classes[i] for i in self.train_data['class_name']]
        X_test = vect.transform(self.test_data['data'])
        Y_test = [classes[i] for i in self.test_data['class_name']]

        nb = MultinomialNB()
        Y_pred = nb.fit(X_train, Y_train).predict(X_test)
        self.metric(Y_test, Y_pred)
        print('---------------------------------------------------')
def main():
    """
    Main program of the project. It calls all functions to get the result
    and shows it to the user.
    """
    try:
        yelp = LoadData()
        user = UserChoice()
        choice = user.get_user_input()
        plots = PlotVisualization(yelp.get_data())
        h = Html()  # Output result to html
        if choice == 'quit':
            print("Quitting...")
        elif choice == "overview":
            plots.plot_overview()
            print("Overview only.")
            h.output_to_file(False)
        else:
            plots.plot_search_results(choice)
            print('Your choice of restaurants received.')
            h.output_to_file(True)
    except ValueError:
        print("Found value error.")
        sys.exit()
    except KeyboardInterrupt:
        print("Interrupted!")
        sys.exit()
    except MemoryError:
        print("Memory Error")
        sys.exit()
def estimate_cycle_dev(cell, temperature):
    """
    Final estimate for the cell-cycle phase deviation.

    Parameters
    ----------
    cell : string
        Cell condition.
    temperature : int
        Temperature condition.

    Returns
    -------
    The standard deviation for the phase progression, and the periods.
    """
    ##################### LOAD DATA ##################
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
    dataClass = LoadData(path, 10000000, temperature=temperature, division=True)
    (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
     ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)
    std, std_T = estimate_phase_dev_from_div(ll_idx_cell_cycle_start)
    return std, std_T
def predict(self):
    """
    Returns the forecasts of the run function.
    """
    self.optimization()
    classifier = pickle.load(open('best_model3.pkl', 'rb'))
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.LinearRegression.y_pred)

    # Test it on some examples from the test set
    data = LoadData(self.link)
    datasets = data.load_data()
    x_test, y_test = datasets[2]
    predicted_values = predict_model(x_test.get_value())

    fig = figure()
    _ = plt.scatter(x_test.get_value(), predicted_values,
                    c='red', label='Predicted Values')
    _ = plt.scatter(x_test.get_value(), y_test.get_value(),
                    facecolors='none', edgecolors='r', label='Sample Points')
    _ = plt.legend()
    return fig
def __init__(self, src, dst, city, dimension, adjacency_list, heuristic_matrices):
    self.__src = src
    self.__dst = dst
    self.__city = city
    self.__open_list = []
    self.__closed_list = []
    self.__dimension = self.__dimension_index(dimension)
    self.__org_node = LoadData.load_org_node(city)
    self.__org_path = LoadData.load_org_path(city, 6)
    self.__link_table = adjacency_list
    self.__heuristic = heuristic_matrices
    # self.__heuristic = self.__calculate_heuristic()
    self.__dimension_cost, self.__heuristic_cost, self.__total_cost = ({} for _ in range(3))
    self.__dimension_cost = {node: float("inf") for node in self.__org_node.keys()}
    self.__dimension_cost[self.__src] = 0
    self.__heuristic_cost[self.__src] = self.__heuristic.loc[self.__src, self.__dst]
    # f(src) = g(src) + h(src)
    self.__total_cost[self.__src] = \
        self.__dimension_cost[self.__src] + self.__heuristic_cost[self.__src]
    self.__previous_node = {self.__src: None}
    heapq.heappush(self.__open_list, (self.__total_cost[self.__src], self.__src))
def predict(self):
    """
    Returns the forecasts of the run function.
    """
    self.optimization()
    classifier = pickle.load(open('best_model3.pkl', 'rb'))
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.LinearRegression.y_pred)

    # Test it on some examples from the test set
    data = LoadData(self.link)
    datasets = data.load_data()
    x_test, y_test = datasets[2]
    predicted_values = predict_model(x_test.get_value())

    fig = figure()
    _ = plt.scatter(x_test.get_value(), predicted_values,
                    c='red', label='Predicted Values')
    _ = plt.scatter(x_test.get_value(), y_test.get_value(),
                    facecolors='none', edgecolors='r', label='Sample Points')
    _ = plt.legend()
    return fig
def compute_likelihood_sigma():
    """
    Compute and plot the likelihood of the phase diffusion parameter,
    depending on the temperature.
    """
    l_T = [34, 37, 40]
    l_likelihood_T = []
    mean_IG = 24
    domain_sigma = np.linspace(0.05, 0.3, 100)
    for T in l_T:
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=T, division=False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)

        # Collect the circadian periods (in hours) between successive peaks
        l_T_clock = []
        for l_peak in ll_peak:
            l_idx_peak = [idx for idx, i in enumerate(l_peak) if i == 1]
            for t_peak_1, t_peak_2 in zip(l_idx_peak[:-1], l_idx_peak[1:]):
                T_per = (t_peak_2 - t_peak_1) / 2
                if T_per < 12 or T_per > 38:
                    # remove outliers: double or missing annotation
                    pass
                else:
                    l_T_clock.append(T_per)

        # Log-likelihood of sigma_theta over the collected periods
        l_likelihood = []
        for sigma_theta in domain_sigma:
            lpx = np.log(1 / sigma_theta)
            for T_per in l_T_clock:
                lpx = lpx + np.log(invgauss(T_per, mean_IG,
                                            4 * np.pi ** 2 / sigma_theta ** 2))
            l_likelihood.append(lpx / len(l_T_clock))
        l_likelihood_T.append(l_likelihood)

    plt.plot(domain_sigma, l_likelihood_T[0], c='red', label='34')
    plt.plot(domain_sigma, l_likelihood_T[1], c='blue', label='37')
    plt.plot(domain_sigma, l_likelihood_T[2], c='orange', label='40')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[0])], c='red')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[1])], c='blue')
    plt.axvline(domain_sigma[np.argmax(l_likelihood_T[2])], c='orange')
    plt.ylabel(r'$log(L(\sigma_\theta))$')
    plt.xlabel(r'$\sigma_\theta$')
    plt.legend()
    plt.savefig('Results/RawData/likelihood_sigma_theta.pdf')
    plt.show()
    plt.close()
def __init__(self, tickers, start='2014-01-01', end='2018-01-01', interval='1d',
             n_series=20, T_pred=10, n_cols=30, n_rows=30, T_space=10, train=True):
    self.folder = './' + ''.join(tickers) + '_start' + start + '_end' + end + \
                  '_int' + interval + '/case' + str(n_series) + '_' + str(T_pred) + \
                  '_' + str(n_cols) + '_' + str(n_rows) + '_' + str(T_space)
    try:
        # Reuse previously generated arrays if they exist
        self.original = np.load(self.folder + '/original.npy')
        if train:
            self.x = np.load(self.folder + '/Xtrain.npy')
            self.y = np.load(self.folder + '/Ytrain.npy')
        else:
            self.x = np.load(self.folder + '/Xtest.npy')
            self.y = np.load(self.folder + '/Ytest.npy')
    except:
        ld = LoadData(tickers, start, end, interval)
        try:
            ld.unprocessed = pd.read_csv('./' + ''.join(tickers) + '_start' + start +
                                         '_end' + end + '_int' + interval +
                                         '/UnprocessedData.csv')
        except:
            print('DOWNLOADING DATA')
            ld.download()
        print('PROCESSING DATA')
        ld.process(n_series, T_pred, n_cols, n_rows, T_space, plot=True)
        ld.cut_and_shuffle()
        if train:
            self.x = ld.Xtrain
            self.y = ld.Ytrain
        else:
            self.x = ld.Xtest
            self.y = ld.Ytest
        self.original = ld.original
    # Shape of X: (number of samples, number of tickers, number of rows, number of columns)
    # Shape of Y: (number of samples, number of tickers)
    self.len = self.x.shape[0]
def train():
    batch_size = 10
    epochs = 50
    bestloss = 1e10
    learning_rate = 5e-4
    Trainer = VGG16Trainer().cuda()

    path = './train'
    trainLabel = getLabel(path)
    traindata = LoadData(path, Label=trainLabel)
    dataloader = DataLoader(traindata, batch_size, shuffle=True)
    valLabel = getLabel('./val')
    valdata = LoadData('./val', Label=valLabel)
    valdataloader = DataLoader(valdata, batch_size, shuffle=True)

    count = 0
    for epoch in range(epochs):
        # Halve the learning rate after 5 epochs without improvement,
        # stop after 10 epochs without improvement
        if count == 5:
            learning_rate *= 0.5
            for param_group in Trainer.optimizer.param_groups:
                param_group['lr'] = learning_rate
        if count == 10:
            break

        Trainer.train()
        totalloss = 0
        for i_batch, batch_data in enumerate(dataloader):
            image = batch_data['image']
            label = batch_data['label'].cuda()
            image = image.cuda().float() / 255.
            loss = Trainer.train_step(image, label)
            totalloss += loss
        print('train loss:')
        print(totalloss / len(dataloader))

        Trainer.eval()
        valloss = 0
        with torch.no_grad():
            for i_batch, batch_data in enumerate(valdataloader):
                image = batch_data['image']
                label = batch_data['label'].cuda()
                image = image.cuda().float() / 255.
                valloss += Trainer.forward(image, label)
        print('val loss:')
        valloss_a = valloss / len(valdataloader)
        print(valloss_a)

        if valloss_a < bestloss:
            bestloss = valloss_a
            print('saved')
            Trainer.save('VGG.pkl')
            count = 0
        else:
            count += 1
def __init__(self, src, dst, dtype, location, dimension):
    self.src = int(src)
    self.dst = int(dst)
    self.org_node = LoadData.load_org_node(location)
    self.org_path = LoadData.load_org_path(location, dimension)
    self.link_table = LoadData.load_linking_table(location)
    # print('org_node', self.org_node)
    # print('org_path', self.org_path)
    # print('self.link_table', self.link_table)
    self.heap = Heap(dtype)
    self.quadrant = self.__get_quadrant(self.src, self.dst)
def train(self):
    l = LoadData()
    stopWords = l.loadStopWords()
    self.loadDataCSV('bbc-text.csv')
    vect = TfidfVectorizer(stop_words=stopWords)
    self.train_and_test_split(0.75)

    X_train = vect.fit_transform(self.train_data['data'])
    Y_train = self.train_data['class_name']
    X_test = vect.transform(self.test_data['data'])
    Y_test = self.test_data['class_name']

    nb = MultinomialNB()
    Y_pred = nb.fit(X_train, Y_train).predict(X_test)
    self.metric(Y_test, Y_pred)
def __init__(self):
    self.rawdata = LoadData(r'.\ex1data1.txt')
    self.X, self.y, self.batch_size = self.rawdata.loadTXT()
    self.theta = np.array([[0.], [0.]])
    self.alpha = 0.01
    self.costlst = []
    self.thetalst = []
    print(self.theta.shape)
    print(self.batch_size)
    print(np.sum(self.X[:, 1]))
    print(np.sum(self.y))
def __init__(self):
    self.rawdata = LoadData(r'.\ex2data1.txt')
    self.X, self.y, self.batch_size, rawdata = self.rawdata.loadTXT()
    self.theta = np.array([[0.], [0.], [0.]])
    self.alpha = 0.01
    self.costlst = []
    self.thetalst = []
    # Split the raw data into positive and negative examples for plotting
    self.rawdata_p = rawdata[np.where(rawdata[:, 2] == 1)]
    self.rawdata_n = rawdata[np.where(rawdata[:, 2] == 0)]
    self.y = self.y[0]
def main():
    # Data loading
    arg_parser = CSACCMArgs()
    args = arg_parser.args
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    data = LoadData(args.path, args.dataset, label=args.label, sep=args.sep,
                    append_id=True, include_id=False)
    if args.verbose > 0:
        print(args)

    # Training
    t1 = time()
    model = CSACCM(feature_num=len(data.features), feature_dims=data.feature_dims,
                   f_vector_size=args.f_vector_size,
                   user_feature_num=len(data.user_features),
                   item_feature_num=len(data.item_features),
                   user_num=data.user_num, item_num=data.item_num,
                   ui_vector_size=args.ui_vector_size, warm_ratio=args.warm_ratio,
                   cb_hidden_layers=eval(args.cb_hidden_layers),
                   attention_size=args.attention_size,
                   optimizer=args.optimizer, learning_rate=args.lr,
                   batch_size=args.batch_size, epoch=args.epoch,
                   dropout_keep=args.dropout_keep, l2_regularize=args.l2,
                   verbose=args.verbose, random_seed=args.random_seed,
                   model_path=args.model)
    if args.load == 1:
        model.load_model()
    # model.run_debug(data.test_data)

    # train
    model.train(data.train_data, data.validation_data, data.test_data, args.load == 1)
    model.print_result(t1)

    # test
    model.load_model()
    model.predict(data.test_data)
def feature_extraction_NFM(X, y_binary, i_Pos_sample_set, i_Neg_sample_set, args):
    # The labeled data comes from both positive and negative samples
    labeled_idx = set.union(i_Pos_sample_set, i_Neg_sample_set)
    X_label = [X[i] for i in labeled_idx]
    y_label = [y_binary[i] for i in labeled_idx]
    X_train = np.asarray(X_label)
    Y_train = np.asarray(y_label)
    X_validation = np.asarray(X_label)
    Y_validation = np.asarray(y_label)
    # Use the whole dataset as the "test" set so that new features are produced
    # for every sample
    X_test = copy.deepcopy(X)
    Y_test = copy.deepcopy(y_binary)
    data = LoadData(args.loss_type, X_train, Y_train, X_validation, Y_validation,
                    X_test, Y_test)

    # Training
    model = NeuralFM(data.features_M, args.hidden_factor, args.layers, args.loss_type,
                     args.pretrain, args.epoch, args.batch_size, args.lr, args.lamda,
                     args.keep_prob, args.optimizer, args.batch_norm,
                     args.activation_function, args.verbose, args.early_stop)
    model.train(data.Train_data, data.Validation_data, data.Test_data)
    features = model.get_deep_feature(data.Test_data)  # or model.get_bi_feature(data.Test_data)
    return features
def MeanAndVarMapper(fileName):
    # Map step: emit (mean, variance, count) for a single input file
    inputData = LoadData(fileName)[0]
    inputMat = numpy.mat(inputData)
    mean = numpy.mean(inputMat)
    var = numpy.var(inputData)
    count = len(inputData)
    return mean, var, count
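# Not part of the source: a minimal reducer sketch, assuming the mapper above is used in a
# map/reduce style where each file's (mean, var, count) triple must be combined into global
# statistics. It relies on the identity E[X^2] = Var(X) + E[X]^2, which matches numpy.var's
# default (population) variance. The function name and argument are hypothetical.
def MeanAndVarReducer(triples):
    triples = list(triples)
    total = sum(count for _, _, count in triples)
    # Grand mean: count-weighted average of the per-file means
    grand_mean = sum(mean * count for mean, _, count in triples) / total
    # Pooled second moment, then subtract the squared grand mean
    second_moment = sum((var + mean ** 2) * count for mean, var, count in triples) / total
    grand_var = second_moment - grand_mean ** 2
    return grand_mean, grand_var, total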
def job(data, city, dimension, heuristic_matrices, pname):
    """
    :param data: array of (source, destination) node pairs
    :param heuristic_matrices:
    :param pname: identifier of the worker process
    :return:
    """
    print('<---Processing ' + str(pname) + ' start--->')
    adjacency_list = LoadData.load_linking_table(city)
    count = 0
    byte_limit = 1024 * 20
    full_path = []
    for (src, dst) in data:
        if src != dst:
            algorithm = Astar(src, dst, city, dimension, adjacency_list, heuristic_matrices)
            algorithm.query()
            path = algorithm.get_shortest_path()
            if bool(path):
                full_path.append(tuple(path))
        # Write results to file, splitting into chunks by size
        if sys.getsizeof(full_path) >= byte_limit:
            Export.export_a_star(full_path, dimension, city, count)
            count += 1
            full_path = []
    print('<---End with ' + str(pname), '--->')
def main():
    # Data loading
    arg_parser = BaseArgs()
    args = arg_parser.args
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    data = LoadData(args.path, args.dataset, label=args.label,
                    append_id=False, include_id=True)
    if args.verbose > 0:
        print(args)

    # Training
    t1 = time()
    model = BaseModel(feature_num=len(data.features), optimizer=args.optimizer,
                      learning_rate=args.lr, batch_size=args.batch_size,
                      epoch=args.epoch, dropout_keep=args.dropout_keep,
                      l2_regularize=args.l2, verbose=args.verbose,
                      random_seed=args.random_seed, model_path=args.model)
    if args.load == 1:
        model.load_model()
    # model.run_debug(data.test_data)

    # train
    model.train(data.train_data, data.validation_data, data.test_data, args.load == 1)
    model.print_result(t1)

    # test
    model.load_model()
    model.predict(data.test_data)
def plot_hist_periods(cell, temperature, division):
    """
    Given a cell condition, compute and plot a histogram of periods.

    Parameters
    ----------
    cell : string
        Cell condition.
    temperature : int
        Temperature condition.
    division : bool
        Whether dividing traces are selected.
    """
    ##################### LOAD DATA ##################
    if cell == 'NIH3T3':
        path = "../Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
    else:
        path = "../Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
    dataClass = LoadData(path, 10000000, temperature=temperature, division=division)
    (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
     ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)

    if division:
        ##################### COMPUTE CELL-CYCLE DISTRIBUTION ##################
        l_T_cell_cycle = []
        for l_div_index in ll_idx_cell_cycle_start:
            for t1, t2 in zip(l_div_index[:-1], l_div_index[1:]):
                l_T_cell_cycle.append((t2 - t1) / 2)

    ##################### COMPUTE CIRCADIAN CLOCK DISTRIBUTION #################
    l_T_clock = []
    for l_peak in ll_peak:
        l_idx_peak = [idx for idx, i in enumerate(l_peak) if i == 1]
        for t_peak_1, t_peak_2 in zip(l_idx_peak[:-1], l_idx_peak[1:]):
            l_T_clock.append((t_peak_2 - t_peak_1) / 2)

    ##################### PLOT BOTH DISTRIBUTIONS ##################
    bins = np.linspace(8, 38, 40)
    if division:
        plt.hist(l_T_cell_cycle, bins, alpha=0.5, label='cell-cycle')
    plt.hist(l_T_clock, bins, alpha=0.5, label='clock')
    plt.legend(loc='upper right')
    if division:
        plt.savefig('../Results/RawData/Distributions_div_' + str(temperature)
                    + "_" + cell + '.pdf', bbox_inches='tight')
    else:
        plt.savefig('../Results/RawData/Distributions_nodiv_' + str(temperature)
                    + "_" + cell + '.pdf', bbox_inches='tight')
    plt.close()
def __init__(self):
    self.headerFiles = {
        "abslogout": "conf/wifi_abslogout_header",
        "login": "******",
        "auth": "conf/wifi_auth_header",
        "logout": "conf/wifi_logout_header"
    }
    self.paramFiles = {
        "abslogout": "conf/wifi_abslogout_params",
        "login": "******",
        "auth": "conf/wifi_auth_params",
        "logout": "conf/wifi_logout_params"
    }
    self.outFiles = {
        "abslogout": "out/abslogout.txt",
        "login": "******",
        "auth": "out/auth.txt",
        "logout": "out/logout.txt"
    }
    ld = LoadData()
    urls = ld.loadUrls('conf/urls')
    self.urls = {
        "abslogout": urls['abslogout'],
        "login": urls['login'],
        "logout": urls['logout'],
        "auth": urls['auth']
    }
    edatas = ld.loadParams('conf/email', split="=")
    self.eml = MyEmail()
    self.eml.setUser(edatas['msg_from'], edatas['msg_to'], edatas['passwd'])
    # self.absLogoutUrl = "http://ipgw.neu.edu.cn/include/auth_action.php"
    # self.loginUrl = "http://ipgw.neu.edu.cn/srun_portal_pc.php?ac_id=1&"
    # self.logoutUrl = "http://ipgw.neu.edu.cn/srun_portal_pc.php?ac_id=1&"
    # self.authUrl = "http://ipgw.neu.edu.cn/include/auth_action.php?"
    self.ld = ld
    if not os.path.exists("out"):
        os.mkdir('out')
    if not os.path.exists('conf'):
        print("Configuration files are missing; the program cannot run. "
              "Please inspect the code and fix it yourself - it is easy.")
def optimize(self):
    """
    Do the actual optimization.
    """
    batch_size, finetune_lr = self.batch_size, self.finetune_lr
    batch_size, n_epochs = self.batch_size, self.training_epochs
    data = LoadData(self.link)
    datasets = data.load_data()
    train_set_x = datasets[0][0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)

    # construct the Deep Belief Network
    dbn = DBN(numpy_rng=numpy_rng, output_layer=LinearRegression, n_ins=1,
              hidden_layers_sizes=[3, 3], n_outs=1)

    # Pretraining
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size, k=k)
    self.__pretraining(dbn.n_layers, pretraining_fns, n_train_batches)

    # Backpropagation (finetuning)
    train_fn, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)
    models = (train_fn, validate_model, test_model)
    finetuning = Optimization(batch_size=batch_size, n_epochs=n_epochs)
    finetuning.Backpropagation(models, datasets)

    test = theano.function(inputs=[dbn.x], outputs=dbn.output_layer.y_pred)
    prediction = test(datasets[2][0].get_value())
    return dbn
def loadData(self):
    l = LoadData()
    data_cluster = l.loadData('wine-clustering.csv')

    # Standardize, then rescale every feature to [0, 1]
    self.std_scaler = StandardScaler()
    self.min_max_scaler = MinMaxScaler()
    data_cluster[data_cluster.columns] = self.std_scaler.fit_transform(data_cluster)
    data_cluster[data_cluster.columns] = self.min_max_scaler.fit_transform(data_cluster)

    # PCA down to two components for visualization
    self.pca_2 = PCA(2)
    self.pca_2_result = self.pca_2.fit_transform(data_cluster)
    self.data = data_cluster
def optimize(self):
    """
    Do the actual optimization.
    """
    batch_size, finetune_lr = self.batch_size, self.finetune_lr
    batch_size, n_epochs = self.batch_size, self.training_epochs
    data = LoadData(self.link)
    datasets = data.load_data()
    train_set_x = datasets[0][0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)

    # construct the Deep Belief Network
    dbn = DBN(numpy_rng=numpy_rng, output_layer=LinearRegression, n_ins=1,
              hidden_layers_sizes=[3, 3], n_outs=1)

    # Pretraining
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size, k=k)
    self.__pretraining(dbn.n_layers, pretraining_fns, n_train_batches)

    # Backpropagation (finetuning)
    train_fn, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)
    models = (train_fn, validate_model, test_model)
    finetuning = Optimization(batch_size=batch_size, n_epochs=n_epochs)
    finetuning.Backpropagation(models, datasets)

    test = theano.function(inputs=[dbn.x], outputs=dbn.output_layer.y_pred)
    prediction = test(datasets[2][0].get_value())
    return dbn
def calculate_heuristic(city):
    """
    Compute the heuristic matrix: the Euclidean distance between every pair of nodes.
    :param city:
    :return:
    """
    org_node = LoadData.load_org_node(city)
    data = np.array(list(org_node.values()))
    result = euclidean_distances(data, data)
    header = list(org_node.keys())
    pd_data = pd.DataFrame(result, columns=header)
    return pd_data
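# Not part of the source: a tiny self-contained check of the pairwise-distance idea used in
# calculate_heuristic above, with three made-up 2-D node coordinates. Node ids and coordinates
# are placeholders; the real function takes them from LoadData.load_org_node(city).
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances

nodes = {1: (0.0, 0.0), 2: (3.0, 4.0), 3: (6.0, 8.0)}
coords = np.array(list(nodes.values()))
dist = pd.DataFrame(euclidean_distances(coords, coords),
                    index=list(nodes.keys()), columns=list(nodes.keys()))
print(dist.loc[1, 2])  # 5.0, the straight-line heuristic between nodes 1 and 2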
class GradientDescent():
    # def __init__(self, X, y, theta, alpha, batch_size):
    def __init__(self):
        self.rawdata = LoadData(r'.\ex2data1.txt')
        self.X, self.y, self.batch_size, rawdata = self.rawdata.loadTXT()
        self.theta = np.array([[0.], [0.], [0.]])
        self.alpha = 0.01
        self.costlst = []
        self.thetalst = []
        # Split the raw data into positive and negative examples for plotting
        self.rawdata_p = rawdata[np.where(rawdata[:, 2] == 1)]
        self.rawdata_n = rawdata[np.where(rawdata[:, 2] == 0)]
        self.y = self.y[0]

    def sigmoid(self, inputs):
        inputs = np.array(inputs)
        sigmoid_scores = [1 / float(1 + np.exp(-x)) for x in inputs[0]]
        return sigmoid_scores

    def costFunction(self):
        # Cross-entropy cost for logistic regression
        h = np.matrix(self.sigmoid(self.theta.transpose() * self.X))
        return 1 / self.batch_size * np.sum(-self.y.transpose() * np.log(h)
                                            - (1 - self.y.transpose()) * np.log(1 - h))

    def gradientDescent(self):
        # The gradient uses the sigmoid hypothesis: h(theta^T x) - y
        h = np.matrix(self.sigmoid(self.theta.transpose() * self.X))
        diff = h - self.y
        self.theta -= self.alpha * (1 / self.batch_size) * self.X * diff.transpose()
        self.thetalst.append(self.theta.copy())  # copy: theta is updated in place
        self.costlst.append(self.costFunction())

    def plotCostJ(self):
        x = [i for i in range(len(self.costlst))]
        plt.plot(x, self.costlst)

    def plotCostJTheta(self):
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        # Trajectory of (theta_0, theta_1) against the cost
        x = [t[0, 0] for t in self.thetalst]
        y = [t[1, 0] for t in self.thetalst]
        z = self.costlst
        ax.plot(x, y, z, label='parametric curve')
        ax.legend()
        plt.show()
def estimate_OU_par(cell, temperature, W=None, gamma_A=0.03, gamma_B=0.03):
    """
    Estimate mean and variance of the OU processes given a set of conditions,
    according to which a set of traces is filtered.

    Parameters
    ----------
    cell : string
        Cell type.
    temperature : integer
        Temperature condition.
    W : list
        Waveform.
    gamma_A : float
        Regression parameter for the amplitude.
    gamma_B : float
        Regression parameter for the background.

    Returns
    -------
    The mean and standard deviations of the amplitude and the background.
    """
    ######### CORRECTION BECAUSE NOT ENOUGH TRACES AT 34°C AND 40°C #########
    print('CAUTION : Parameters for None temperature selected since not enough '
          'traces at 34°C and 40°C')
    temperature = None

    ##################### LOAD DATA ################
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=temperature, division=False)
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
        dataClass = LoadData(path, 10000000, temperature=temperature, division=True)
    try:
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)
    except:
        # Fall back to the parent directory if the script is run from a subfolder
        dataClass.path = '../' + dataClass.path
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)
    return estimate_OU_par_from_signal(ll_signal, W, gamma_A, gamma_B)
def estimate_phase_dev(cell, temperature):
    """
    Final estimate for the circadian phase deviation.

    Parameters
    ----------
    cell : string
        Cell condition.
    temperature : int
        Temperature condition.

    Returns
    -------
    The standard deviation for the phase progression, and the periods.
    """
    ######### CORRECTION BECAUSE NOT ENOUGH TRACES AT 34°C AND 40°C #########
    print('CAUTION : Parameters for None temperature selected since not enough '
          'traces at 34°C and 40°C')
    temperature = None

    ##################### LOAD DATA ##################
    if cell == 'NIH3T3':
        path = "Data/NIH3T3.ALL.2017-04-04/ALL_TRACES_INFORMATION.p"
        dataClass = LoadData(path, 10000000, temperature=temperature, division=False)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)
        std, std_T = estimate_phase_dev_from_signal(ll_peak)
    elif cell == 'U2OS':
        path = "Data/U2OS-2017-03-20/ALL_TRACES_INFORMATION_march_2017.p"
        dataClass = LoadData(path, 10000000, temperature=temperature, division=True)
        (ll_area, ll_signal, ll_nan_circadian_factor, ll_obs_phi, ll_peak,
         ll_idx_cell_cycle_start, T_theta, T_phi) = dataClass.load(load_annotation=True)
        std, std_T = estimate_phase_dev_from_signal(ll_peak)
        # correction for the neglected coupling since these are dividing traces
        std = std * 0.65
        std_T = std_T * 0.65
    else:
        print("Cell type doesn't exist")
    return std, std_T
def train_1_class(self):
    l = LoadData()
    stopWords = l.loadStopWords()
    train_sizes = [100, 200]  # size per class
    for train_size in train_sizes:
        print('Training size:', math.floor(train_size * 0.75) * 2,
              'Test size:', math.ceil(train_size * 0.25) * 2)
        self.loadData(train_size)
        vect = TfidfVectorizer(stop_words=stopWords)

        # balance classes
        temp_class = self.data['class_name'][train_size:]
        temp_data = self.data['data'][train_size:]
        idx = random.choices(range(len(temp_class)), k=train_size)
        temp_class = [temp_class[i] for i in idx]
        temp_data = [temp_data[i] for i in idx]
        del self.data['data'][train_size:]
        del self.data['class_name'][train_size:]
        self.data['class_name'].extend(temp_class)
        self.data['data'].extend(temp_data)
        self.train_and_test_split(0.75)

        X_train = vect.fit_transform(self.train_data['data'])
        Y_train = [1 if i == 'business' else 0
                   for i in self.train_data['class_name']]
        X_test = vect.transform(self.test_data['data'])
        Y_test = [1 if i == 'business' else 0
                  for i in self.test_data['class_name']]

        nb = MultinomialNB()
        Y_pred = nb.fit(X_train, Y_train).predict(X_test)
        self.metric(Y_test, Y_pred)
        print('---------------------------------------------------')
class GradientDescent():
    # def __init__(self, X, y, theta, alpha, batch_size):
    def __init__(self):
        self.rawdata = LoadData(r'.\ex1data1.txt')
        self.X, self.y, self.batch_size = self.rawdata.loadTXT()
        self.theta = np.array([[0.], [0.]])
        self.alpha = 0.01
        self.costlst = []
        self.thetalst = []
        print(self.theta.shape)
        print(self.batch_size)
        print(np.sum(self.X[:, 1]))
        print(np.sum(self.y))

    def costFunction(self):
        # Mean squared error cost for linear regression
        return 1 / (2 * self.batch_size) * np.sum(
            np.square(self.theta.transpose() * self.X - self.y))

    def gradientDescent(self):
        diff = self.theta.transpose() * self.X - self.y
        self.theta -= self.alpha * (1 / self.batch_size) * self.X * diff.transpose()
        self.thetalst.append(self.theta.copy())  # copy: theta is updated in place
        self.costlst.append(self.costFunction())

    def plotCostJ(self):
        x = [i for i in range(len(self.costlst))]
        plt.plot(x, self.costlst)

    def plotCostJTheta(self):
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        # Trajectory of (theta_0, theta_1) against the cost
        x = [t[0, 0] for t in self.thetalst]
        y = [t[1, 0] for t in self.thetalst]
        z = self.costlst
        ax.plot(x, y, z, label='parametric curve')
        ax.legend()
        plt.show()
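# Not part of the source: a hypothetical driver for the GradientDescent class above, assuming
# the LoadData helper, ex1data1.txt, and the numpy/matplotlib imports it relies on are available.
# The iteration count of 1500 and the final plt.show() are illustrative choices only.
if __name__ == '__main__':
    gd = GradientDescent()
    for _ in range(1500):   # run batch gradient descent for a fixed number of steps
        gd.gradientDescent()
    print('final cost:', gd.costlst[-1])
    gd.plotCostJ()          # cost history over iterations
    plt.show()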
def load_model_and_predict(data=None):
    transaction_classifier = TransactionClassifier()
    transaction_classifier.load_model()
    if data is None:
        data = LoadData(x_input_features=[5, 6, 7])
        data.load_processed_data()
    val_data = data.load_validation_data()
    test_data = data.load_test_data()
    print(transaction_classifier.get_confusion_matrix(val_data))
    print(transaction_classifier.get_confusion_matrix(test_data))
def __init__(self):
    ld = LoadData()
    ld.load_csv()
    self.df = ld.df
def sgd_optimization(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=20):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear model.

    Adapted from the MNIST logistic-regression tutorial; here the data are loaded
    from a local .npy file instead of the MNIST pickle.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    link = '/home/fabian/Documents/DeepLearningTutorials/data/DeepQ.npy'
    data = LoadData(link)
    datasets = data.load_data()
    x_train, y_train = datasets[0]
    x_valid, y_valid = datasets[1]

    x = T.matrix('x')
    index = T.lscalar('index')
    y = T.vector('y')
    n_in = 1
    n_out = 1
    batch_size = 20

    # compute number of minibatches for training and validation
    n_train_batches = x_train.get_value().shape[0] // batch_size
    n_valid_batches = x_valid.get_value().shape[0] // batch_size

    print('... building the model')
    classifier = PiecewiseLinear_Reinforcement(n_in=1, input=x, n_out=n_out)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.loss(y)
    error = classifier.error(y)

    validate_model = theano.function(
        inputs=[index],
        outputs=[cost, error],
        givens={
            x: x_valid[index * batch_size: (index + 1) * batch_size],
            y: y_valid[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost and, at the
    # same time, updates the parameters of the model based on the rules defined
    # in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=[cost, error],
        updates=updates,
        givens={
            x: x_train[index * batch_size: (index + 1) * batch_size],
            y: y_train[index * batch_size: (index + 1) * batch_size]
        }
    )
    test = [train_model(i) for i in range(n_train_batches)]

    print('... training the model')
    # early-stopping parameters
    patience = 5000                 # look at this many examples regardless
    patience_increase = 2           # wait this much longer when a new best is found
    improvement_threshold = 0.005   # a relative improvement of this much is significant
    validation_frequency = min(n_train_batches, patience // 2)
    best_expected_utility = -1 * numpy.inf
    test_score = 0.
    # start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)[0]
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # compute expected utility and error on the validation set
                expected_utility = [-1 * validate_model(i)[0]
                                    for i in range(n_valid_batches)]
                validation_error = [validate_model(i)[1]
                                    for i in range(n_valid_batches)]
                this_expected_utility = numpy.mean(expected_utility)
                this_validation_error = numpy.mean(validation_error)
                print(
                    'epoch %i, minibatch %i/%i, expected utility %f, '
                    'validation error %f %%' %
                    (epoch, minibatch_index + 1, n_train_batches,
                     this_expected_utility, this_validation_error * 100.)
                )
                # if we got the best validation score until now
                if this_expected_utility > best_expected_utility:
                    # improve patience if the improvement is good enough
                    if this_expected_utility > best_expected_utility * (1 + improvement_threshold):
                        patience = max(patience, iter * patience_increase)
                    best_expected_utility = this_expected_utility
                    # (test-set evaluation and model saving are commented out in the original)
            if patience <= iter:
                done_looping = True
                break
    end_time = timeit.default_timer()
def optimization(self):
    """
    Does the optimization.
    """
    batch_size, n_epochs = self.batch_size, self.n_epochs
    data = LoadData(self.link)
    datasets = data.load_data()
    x_train, y_train = datasets[0]
    x_validate, y_validate = datasets[1]
    classifier, train_model, validate_model = self.__models(datasets)
    n_train_batches = x_train.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = x_validate.get_value(borrow=True).shape[0] // batch_size

    done_looping = False
    epoch = 0
    train_loss, validation_loss = [], []
    # early-stopping parameters
    patience = 500000              # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is significant
    validation_frequency = min(n_train_batches, patience // 2)
    best_validation_loss = np.inf
    test_score = 0.

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i) for i in range(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if the improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    # (test-set evaluation is commented out in the original)
                    # save the best model
                    with open('best_model3.pkl', 'wb') as f:
                        pickle.dump(classifier, f)
            if patience <= iter:
                done_looping = True
                break