def val(args):
    data_path = config['voc_path']
    loader = VOCbase(data_path, is_transform=True,
                     img_size=(args.img_rows, args.img_cols))
    valloader = DataLoader(loader, batch_size=args.batch_size, num_workers=4)

    model = FCN8s()
    model.load(args.model_path)
    model.cuda()
    model.eval()
    n_classes = model.n_classes

    gts, preds = [], []
    for i, (images, labels) in tqdm(enumerate(valloader)):
        images = Variable(images.cuda())
        labels = Variable(labels.cuda())
        outputs = model(images)
        pred = outputs.data.max(1)[1].cpu().numpy()
        gt = labels.data.cpu().numpy()
        for gt_, pred_ in zip(gt, pred):
            gts.append(gt_)
            preds.append(pred_)

    score, class_iou = scores(gts, preds, n_class=n_classes)
    for k, v in score.items():
        print(k, v)
    for i in range(n_classes):
        print(i, class_iou[i])
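# Several of the segmentation snippets in this collection call a scores(gts, preds, n_class)
# helper whose definition is not included. Below is a minimal sketch, assuming a
# confusion-matrix based metric in the style of common semantic-segmentation utilities
# (overall/mean pixel accuracy and per-class IoU). The function name matches the calls above,
# but the returned key names are illustrative and may differ from the project's own utility.
import numpy as np

def _fast_hist(label_true, label_pred, n_class):
    # Accumulate a confusion matrix over valid (in-range) pixels only.
    mask = (label_true >= 0) & (label_true < n_class)
    hist = np.bincount(
        n_class * label_true[mask].astype(int) + label_pred[mask],
        minlength=n_class ** 2,
    ).reshape(n_class, n_class)
    return hist

def scores(label_trues, label_preds, n_class):
    hist = np.zeros((n_class, n_class))
    for lt, lp in zip(label_trues, label_preds):
        hist += _fast_hist(lt.flatten(), lp.flatten(), n_class)
    acc = np.diag(hist).sum() / hist.sum()
    acc_cls = np.nanmean(np.diag(hist) / hist.sum(axis=1))
    iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
    mean_iu = np.nanmean(iu)
    # First dict: global metrics; second dict: per-class IoU indexed by class id.
    return ({"Overall Acc": acc, "Mean Acc": acc_cls, "Mean IoU": mean_iu},
            dict(zip(range(n_class), iu)))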
def post(self):
    parser_copy = parser.copy()
    parser_copy.add_argument('year', type=str, required=True,
                             help=u"Academic year must not be empty")
    parser_copy.add_argument('term', type=str, required=True,
                             help=u"Term must not be empty")
    args = parser_copy.parse_args()
    cookies = _login()
    _scores = scores(cookies, args['year'], args['term'])
    return _scores
def validate(args):
    # Setup Dataloader
    data_loader = get_loader(args.dataset)
    data_path = get_data_path(args.dataset)
    loader = data_loader(data_path, split=args.split, is_transform=True)
    n_classes = loader.n_classes
    valloader = data.DataLoader(loader, batch_size=1)

    # Setup Model
    model = Net(n_classes)
    print(get_n_params(model))
    model.load_state_dict(torch.load(args.model_path))
    # print(model)
    model.eval()

    if torch.cuda.is_available():
        model.cuda(0)

    gts, preds = [], []
    for i, (images, labels) in enumerate(valloader):
        if torch.cuda.is_available():
            images = Variable(images.cuda(0))
            labels = Variable(labels.cuda(0))
        else:
            images = Variable(images)
            labels = Variable(labels)
        outputs = model(images)
        # np.int was removed from recent NumPy releases; plain int is equivalent here
        pred = outputs.data.max(1)[1].cpu().numpy().astype(int)
        gt = labels.data.cpu().numpy().astype(int)
        for gt_, pred_ in zip(gt, pred):
            gts.append(gt_)
            preds.append(pred_)
        # pred = pred.reshape(360, 480)
        # pred = decode_segmap(pred)
        # m.imsave('./images/{}.png'.format(i), pred)
        # break

    score, class_iou = scores(gts, preds, n_class=n_classes)
    for k, v in score.items():
        print(k, v)
    for i in range(n_classes):
        print(i, class_iou[i])
def validate(args):
    # Setup Dataloader
    data_loader = get_loader(args.dataset)
    data_path = get_data_path(args.dataset)
    loader = data_loader(data_path, split=args.split, is_transform=True)
    n_classes = loader.n_classes
    valloader = data.DataLoader(loader, batch_size=args.batch_size)

    # Setup Model
    model = LinkNet(n_classes)
    model.load_state_dict(torch.load(args.model_path))
    model.eval()

    if torch.cuda.is_available():
        model.cuda(0)

    gts, preds = [], []
    for i, (images, labels) in enumerate(valloader):
        if torch.cuda.is_available():
            images = Variable(images.cuda(0))
            labels = Variable(labels.cuda(0))
        else:
            images = Variable(images)
            labels = Variable(labels)
        t1 = time.time()
        outputs = model(images)
        t2 = time.time()
        print(t2 - t1)  # per-batch forward-pass time
        pred = outputs.data.max(1)[1].cpu().numpy()
        gt = labels.data.cpu().numpy()
        for gt_, pred_ in zip(gt, pred):
            gts.append(gt_)
            preds.append(pred_)

    score, class_iou = scores(gts, preds, n_class=n_classes)
    # Python 3 print calls (the original mixed in Python 2 print statements)
    for k, v in score.items():
        print(k, v)
    for i in range(n_classes):
        print(i, class_iou[i])
def build_model(self):
    x = utils.input_batch_norm(self.X)
    h_fc1 = self._add_layers(x)
    concat_outputs = h_fc1

    with tf.variable_scope('scores'):
        pred_y = utils.scores(h_fc1, [128, self.labels_num], [self.labels_num])

    with tf.variable_scope('train'):
        lambda_loss_amount = 0.0015
        # L2 regularization over all trainable variables
        l2 = lambda_loss_amount * \
            sum(tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.Y,
                                                    logits=pred_y)) + l2
        correct_prediction = tf.equal(tf.argmax(self.Y, 1), tf.argmax(pred_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    return concat_outputs, cross_entropy, accuracy, correct_prediction
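# utils.scores(inputs, weight_shape, bias_shape) is called here and in the LSTM model further
# below to map the last hidden layer to class logits. Its definition is not included; the
# following is a minimal sketch, assuming a plain fully-connected projection in TF1 style.
# Variable names and initializers are assumptions, not the project's actual code.
import tensorflow as tf

def scores(inputs, weight_shape, bias_shape):
    # weight_shape = [in_dim, num_labels], bias_shape = [num_labels]
    weights = tf.get_variable('weights', weight_shape,
                              initializer=tf.truncated_normal_initializer(stddev=0.1))
    biases = tf.get_variable('biases', bias_shape,
                             initializer=tf.constant_initializer(0.1))
    # Returns unnormalized logits; softmax is applied inside the cross-entropy loss above.
    return tf.matmul(inputs, weights) + biases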
def main():
    parser = argparse.ArgumentParser(description='Omega integrals')
    parser.add_argument('-p', '--process', type=str,
                        choices=["omega11", "omega12", "omega13", "omega22", "omegas"],
                        default="omegas",
                        help='Comma-separated names of omega integrals whose regression is performed')
    parser.add_argument('-a', '--algorithm', type=str,
                        choices=['DT', 'RF', 'ET', 'GP', 'KN', 'SVM', 'KR', 'GB', 'HGB', 'MLP'],
                        default='DT',
                        help='transport algorithm')
    parser.add_argument('-l', '--load_model', type=str2bool, nargs='?',
                        choices=[False, True], default=False, const=True,
                        help='Load saved model')
    args = parser.parse_args()

    process = args.process.split(',')
    print("Process: ", colored(process[0], 'green'))
    algorithm = args.algorithm.split(',')
    print("Algorithm: ", colored(algorithm[0], 'blue'))
    load_model = args.load_model
    print("Load: ", colored(load_model, 'magenta'))

    src_dir = "."
    print("SRC: ", colored(src_dir, 'yellow'))
    output_dir = src_dir + "/.."
    print("OUTPUT: ", colored(output_dir, 'red'))

    n_jobs = 2

    # Import database
    with open('../data/omega_integrals_encoded.txt') as f:
        lines = (line for line in f if not line.startswith('#'))
        dataset = np.loadtxt(lines, skiprows=1)
    print(dataset.shape)

    x = dataset[:, 0:3]  # c, d, T
    y = dataset[:, 3:]   # Ω(1,1), Ω(1,2), Ω(1,3), Ω(2,2)
    print(x.shape)
    print(y.shape)

    print("### Phase 1: PRE_PROCESSING ###")
    ########################################

    # 1.0) create directory tree
    model, scaler, figure = utils.mk_tree(process[0], algorithm[0], output_dir)

    # 1.1) train/test split dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75,
                                                        test_size=0.25, random_state=69)

    # 1.2) scale data and save scalers
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    sc_x.fit(x_train)
    x_train = sc_x.transform(x_train)
    x_test = sc_x.transform(x_test)
    sc_y.fit(y_train)
    y_train = sc_y.transform(y_train)
    y_test = sc_y.transform(y_test)

    print('Training Features Shape:', x_train.shape)
    print('Training Labels Shape:', y_train.shape)
    print('Testing Features Shape:', x_test.shape)
    print('Testing Labels Shape:', y_test.shape)

    dump(sc_x, open(scaler + "/scaler_x_" + process[0] + '.pkl', 'wb'))
    dump(sc_y, open(scaler + "/scaler_y_" + process[0] + '.pkl', 'wb'))

    print("### Phase 2: PROCESSING ###")
    ####################################

    # 2.0) estimator selection (the original listed 'GB' twice; the duplicate branch was unreachable)
    if algorithm[0] == 'DT':
        est, hyper_params = estimators.est_DT()
    elif algorithm[0] == 'ET':
        est, hyper_params = estimators.est_ET()
    elif algorithm[0] == 'SVM':
        est, hyper_params = estimators.est_SVM()
    elif algorithm[0] == 'KR':
        est, hyper_params = estimators.est_KR()
    elif algorithm[0] == 'KN':
        est, hyper_params = estimators.est_KN()
    elif algorithm[0] == 'MLP':
        est, hyper_params = estimators.est_MLP()
    elif algorithm[0] == 'GB':
        est, hyper_params = estimators.est_GB()
    elif algorithm[0] == 'HGB':
        est, hyper_params = estimators.est_HGB()
    elif algorithm[0] == 'RF':
        est, hyper_params = estimators.est_RF()
    else:
        print("Algorithm not implemented ...")

    # 2.1) search for the best hyper-parameter combination
    # Exhaustive search over specified parameter values for the estimator
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    gs = GridSearchCV(est, cv=3, param_grid=hyper_params, verbose=2, n_jobs=n_jobs,
                      scoring='r2', refit=True, pre_dispatch='n_jobs',
                      error_score=np.nan, return_train_score=True)

    # Randomized search on hyper-parameters
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    # class sklearn.model_selection.RandomizedSearchCV(estimator, param_distributions, *, n_iter=10,
    #     scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
    #     random_state=None, error_score=nan, return_train_score=False)
    #gs = RandomizedSearchCV(est, cv=10, n_iter=10, param_distributions=hyper_params, verbose=2,
    #                        n_jobs=n_jobs, scoring='r2', refit=True, pre_dispatch='n_jobs',
    #                        error_score=np.nan, return_train_score=True)

    # 2.2) training
    utils.fit(x_train, y_train, gs)

    # 2.3) prediction
    y_regr = utils.predict(x_test, gs)

    print("### Phase 3: POST-PROCESSING ###")
    #########################################

    # 3.0) save best hyper-parameters
    results = pd.DataFrame(gs.cv_results_)
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
    #compression_opts = dict(method='zip', archive_name='GridSearchCV_results.csv')
    #results.to_csv('GridSearchCV_results.zip', index=False, compression=compression_opts)
    results.to_csv(model + "/../" + "GridSearchCV_results.csv", index=False,
                   sep='\t', encoding='utf-8')

    # results print screen
    print("Best: %f using %s" % (gs.best_score_, gs.best_params_))
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    params = gs.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # 3.1) compute score metrics
    utils.scores(sc_x, sc_y, x_train, y_train, x_test, y_test, model, gs)

    # 3.2) back to original values (unscaling)
    x_test_dim = sc_x.inverse_transform(x_test)
    y_test_dim = sc_y.inverse_transform(y_test)
    y_regr_dim = sc_y.inverse_transform(y_regr)

    # 3.3) make plots
    utils.draw_plot(x_test_dim, y_test_dim, y_regr_dim, figure, process[0], algorithm[0])

    # 3.4) save model to disk
    dump(gs, model + "/model_" + process[0] + ".sav")
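# A possible usage sketch for the artifacts saved by main() above: reload the scalers and the
# fitted GridSearchCV object and predict on new, unscaled inputs. It assumes the dump calls
# above come from joblib (which accepts both file objects and paths); the concrete paths and
# the sample values below are hypothetical placeholders, not the project's actual layout.
from joblib import load
import numpy as np

sc_x = load("../scaler/scaler_x_omegas.pkl")   # hypothetical path
sc_y = load("../scaler/scaler_y_omegas.pkl")   # hypothetical path
gs = load("../model/model_omegas.sav")         # hypothetical path

x_new = np.array([[1.0, 2.0, 3000.0]])         # one (c, d, T) sample in physical units
y_new = sc_y.inverse_transform(gs.predict(sc_x.transform(x_new)))
print(y_new)  # predicted Ω(1,1), Ω(1,2), Ω(1,3), Ω(2,2)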
#y_val = data_val.target
#X_val = data_val.drop('target', axis=1)
#X_val = normalize(X_val)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_arr, y_arr,
                                                     test_size=1.0 / 6,
                                                     shuffle=False)  # shuffle?

models = [GaussianNB(),
          SVC(random_state=5),
          RandomForestClassifier(random_state=5),
          MLPClassifier(random_state=5)]

for model in models:
    model.fit(X_train, y_train)

UTILS.scores(models, X_test, y_test)

#print models[0].estimator.get_params().keys()

'''
# Grid search for each model
grid_data = [
    {'kernel': ['rbf', 'sigmoid'], 'C': [0.1, 1, 10, 100], 'random_state': [5]},
    {'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy'],
     'max_depth': [None, 10, 50, 100], 'min_samples_split': [2, 5, 10],
     'random_state': [5]},
    {'hidden_layer_sizes': [10, 50, 100],
     'activation': ['identity', 'logistic', 'tanh', 'relu'],
def build_model(self):
    x_serie_c = self.X
    xs_s = tf.split(x_serie_c, num_or_size_splits=self.config.c_win_size, axis=1)
    ys_s = tf.split(self.YS, num_or_size_splits=self.config.c_win_size, axis=1)

    concat_outputs = []
    self.losses = []
    self.accuracies = []
    self.correct_preds = []

    with tf.variable_scope('simple_activity') as scope:
        is_reuse = False
        for i, j in zip(xs_s, ys_s):
            sa = SimpleActivity(i, tf.reshape(j, [-1, self.s_labels_num]),
                                self.config, is_training=self.is_training,
                                norm=self.norm)
            output, loss, accuracy, correct_pred_s = sa.build_model()
            concat_outputs.append(output)
            self.losses.append(loss)
            self.accuracies.append(accuracy)
            self.correct_preds.append(correct_pred_s)
            # Share weights across all windows of the simple-activity model
            if not is_reuse:
                scope.reuse_variables()
                is_reuse = True

        self.s_mean_loss = tf.reduce_mean(self.losses)
        tf.summary.scalar('loss', self.s_mean_loss)
        self.s_mean_accuracy = tf.reduce_mean(self.accuracies)
        tf.summary.scalar('accuracy', self.s_mean_accuracy)
        self.train_step_s = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.s_mean_loss)

    with tf.variable_scope('complex_activity'):
        with tf.variable_scope("lstm_layers"):
            lstm_size = 128
            cells = tf.contrib.rnn.MultiRNNCell(
                [utils.lstm_cell(lstm_size) for _ in range(3)],
                state_is_tuple=True)
            outputs, states = tf.contrib.rnn.static_rnn(cells, concat_outputs,
                                                        dtype=tf.float32)

        pred_y_c = utils.scores(outputs[-1], [lstm_size, self.c_labels_num],
                                [self.c_labels_num])

        lambda_loss_amount = 0.0015
        l2 = lambda_loss_amount * \
            sum(tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.YC,
                                                    logits=pred_y_c)) + l2
        tf.summary.scalar("loss", cross_entropy)
        self.train_step_c = tf.train.AdamOptimizer(
            self.learning_rate).minimize(cross_entropy)
        tf.summary.scalar("learning_rate", self.learning_rate)

        self.joint_loss = cross_entropy + self.s_mean_loss
        self.c_loss = cross_entropy
        tf.summary.scalar("joint_loss", self.joint_loss)
        self.joint_train_step = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.joint_loss)

        self.correct_prediction_c = tf.equal(tf.argmax(self.YC, 1),
                                             tf.argmax(pred_y_c, 1))
        self.c_accuracy = tf.reduce_mean(
            tf.cast(self.correct_prediction_c, tf.float32))
        tf.summary.scalar("accuracy", self.c_accuracy)
uni = utils.cros_validation(regression.LinearRegression(regularization_factor=1.0),
                            X, train_label, n_folds, random_grid)

random_grid = {'n_estimators': [100, 200, 300, 400, 500, 600, 700, 720, 740, 760, 780, 800]}
# Use a distinct name for the estimator so the lightgbm module `lgb` is not shadowed
lgb_est = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                            learning_rate=0.05, n_estimators=800,
                            max_bin=60, bagging_fraction=0.8, bagging_freq=5,
                            feature_fraction=0.2319, feature_fraction_seed=9,
                            bagging_seed=9, min_data_in_leaf=6,
                            min_sum_hessian_in_leaf=11)
model_lgb = utils.cros_validation(lgb_est, train.values, train_label, n_folds, random_grid)

# Report k-fold cross-validation RMSE for each model
utils.scores('Lasso', utils.cv_rmse(lasso, train.values, train_label, n_folds))
utils.scores('Multivariate Linear Regression', utils.cv_rmse(multi, train.values, train_label, n_folds))
utils.scores('Univariate Linear Regression', utils.cv_rmse(uni, X, train_label, n_folds))
utils.scores('Gradient Boosting', utils.cv_rmse(model_lgb, train.values, train_label, n_folds))

# --------------------------- Meta Learning ----------------------------------
# Blend the individual predictions (targets are in log space, hence np.exp)
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = np.exp(lasso.predict(test) * 0.2 +
                          multi.predict(test) * 0.2 +
                          model_lgb.predict(test) * 0.6)
def train(args, out, net_name):
    data_path = get_data_path(args.dataset)
    data_loader = get_loader(args.dataset)
    loader = data_loader(data_path, is_transform=True)
    n_classes = loader.n_classes
    print(n_classes)

    kwargs = {'num_workers': 8, 'pin_memory': True}
    trainloader = data.DataLoader(loader, batch_size=args.batch_size, shuffle=True)
    another_loader = data_loader(data_path, split='val', is_transform=True)
    valloader = data.DataLoader(another_loader, batch_size=args.batch_size, shuffle=True)

    # compute weight for cross_entropy2d
    norm_hist = hist / np.max(hist)
    weight = 1 / np.log(norm_hist + 1.02)
    weight[-1] = 0
    weight = torch.FloatTensor(weight)

    model = Bilinear_Res(n_classes)
    if torch.cuda.is_available():
        model.cuda(0)
        weight = weight.cuda(0)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_rate,
                                 weight_decay=args.w_decay)
    # optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr_rate)
    scheduler = StepLR(optimizer, step_size=100, gamma=args.lr_decay)

    for epoch in tqdm.tqdm(range(args.epochs), desc='Training', ncols=80, leave=False):
        scheduler.step()
        model.train()
        loss_list = []
        file = open(out + '/{}_epoch_{}.txt'.format(net_name, epoch), 'w')

        for i, (images, labels) in tqdm.tqdm(enumerate(trainloader),
                                             total=len(trainloader),
                                             desc='Iteration', ncols=80, leave=False):
            if torch.cuda.is_available():
                images = Variable(images.cuda(0))
                labels = Variable(labels.cuda(0))
            else:
                images = Variable(images)
                labels = Variable(labels)

            optimizer.zero_grad()
            outputs = model(images)
            loss = cross_entropy2d(outputs, labels, weight=weight)
            loss_list.append(loss.data[0])
            loss.backward()
            optimizer.step()

        # file.write(str(np.average(loss_list)))
        print(np.average(loss_list))
        file.write(str(np.average(loss_list)) + '\n')

        model.eval()
        gts, preds = [], []
        if (epoch % 10 == 0):
            for i, (images, labels) in tqdm.tqdm(enumerate(valloader),
                                                 total=len(valloader),
                                                 desc='Valid Iteration', ncols=80,
                                                 leave=False):
                if torch.cuda.is_available():
                    images = Variable(images.cuda(0))
                    labels = Variable(labels.cuda(0))
                else:
                    images = Variable(images)
                    labels = Variable(labels)
                outputs = model(images)
                pred = outputs.data.max(1)[1].cpu().numpy()
                gt = labels.data.cpu().numpy()
                for gt_, pred_ in zip(gt, pred):
                    gts.append(gt_)
                    preds.append(pred_)

            score, class_iou = scores(gts, preds, n_class=n_classes)
            for k, v in score.items():
                file.write('{} {}\n'.format(k, v))
            for i in range(n_classes):
                file.write('{} {}\n'.format(i, class_iou[i]))

        torch.save(model.state_dict(),
                   out + "/{}_{}_{}.pkl".format(net_name, args.dataset, epoch))
        file.close()
                pred[:text_len])).tolist()  # convert tensor to list
            epoch_preds.append(pred_cut)

        for tag, text_len in zip(batch_tag, text_lens):  # batch_tag: [seq_len, num_tags]
            tag_cut = tf.make_ndarray(tf.make_tensor_proto(
                tag[:text_len])).tolist()  # convert tensor to list
            epoch_trues.append(tag_cut)

        progress_bar.update(1)

# Convert epoch_idxs to epoch_tags
epoch_tag_preds = utils.epoch_idx2tag(epoch_preds, idx2tag)
epoch_tag_trues = utils.epoch_idx2tag(epoch_trues, idx2tag)

# Calculate metrics for the whole epoch
train_scores = utils.scores(epoch_tag_trues, epoch_tag_preds)

### Valid ###
epoch_preds, epoch_trues = [], []
with tqdm(total=len(list(valid_batches))) as progress_bar:
    for batch_seq, batch_tag in valid_batches:
        preds, text_lens = valid_fn(model, valid_loss, batch_seq, batch_tag)

        # Unpad preds/tags to the real lengths (for metrics)
        for pred, text_len in zip(preds, text_lens):  # logit: [seq_len, num_tags]
            pred_cut = tf.make_ndarray(
                tf.make_tensor_proto(
                    pred[:text_len])).tolist()  # convert tensor to list
            epoch_preds.append(pred_cut)
def train(args):
    if args.dataset == 'pascal':
        another_loader = VOC2011ClassSeg(root='/home/vietdv', transform=True)
        loader = SBDClassSeg(root='/home/vietdv', transform=True, augment=True)
    else:
        data_path = get_data_path(args.dataset)
        label_scale = False
        if args.model == 'encoder':
            label_scale = True
        data_loader = get_loader(args.dataset)
        loader = data_loader(data_path, is_transform=True, augment=True,
                             label_scale=label_scale)
        another_loader = data_loader(data_path, split='val', is_transform=True,
                                     label_scale=label_scale)

    n_classes = loader.n_classes
    trainloader = data.DataLoader(loader, batch_size=args.batch_size)
    valloader = data.DataLoader(another_loader, batch_size=1)

    # get weight for cross_entropy2d
    weight = loader.weight
    model = Net(n_classes)
    if torch.cuda.is_available():
        model.cuda(0)
        weight = weight.cuda(0)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_rate,
                                 weight_decay=args.w_decay)
    criterion = CrossEntropyLoss2d(weight, False)
    # alpha = 0.5

    # polynomial learning-rate decay
    lambda1 = lambda epoch: pow((1 - (epoch / args.epochs)), 0.9)
    ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    for epoch in range(args.epochs):
        model.train()
        loss_list = []
        file = open(args.folder + '/{}_{}.txt'.format('hnet', epoch), 'w')
        scheduler.step(epoch)

        for i, (images, labels) in enumerate(trainloader):
            if torch.cuda.is_available():
                images = Variable(images.cuda(0))
                labels = Variable(labels.cuda(0))
            else:
                images = Variable(images)
                labels = Variable(labels)

            optimizer.zero_grad()
            outputs = model(images)
            # loss = alpha * criterion(outputs, labels) / len(images) \
            #        + (1 - alpha) * lovasz_softmax(outputs, labels, ignore=n_classes - 1)
            loss = criterion(outputs, labels) / len(images)
            print(loss.data[0])
            loss_list.append(loss.data[0])
            loss.backward()
            optimizer.step()

        file.write(str(np.average(loss_list)) + '\n')

        model.eval()
        gts, preds = [], []
        for i, (images, labels) in enumerate(valloader):
            if torch.cuda.is_available():
                images = Variable(images.cuda(0))
                labels = Variable(labels.cuda(0))
            else:
                images = Variable(images)
                labels = Variable(labels)
            outputs = model(images)
            pred = outputs.data.max(1)[1].cpu().numpy()
            gt = labels.data.cpu().numpy()
            for gt_, pred_ in zip(gt, pred):
                gts.append(gt_)
                preds.append(pred_)

        score, class_iou = scores(gts, preds, n_class=n_classes)
        # scheduler.step(score['Mean IoU : \t'])
        for k, v in score.items():
            file.write('{} {}\n'.format(k, v))
        for i in range(n_classes - 1):
            file.write('{} {}\n'.format(i, class_iou[i]))
        torch.save(model.state_dict(),
                   args.folder + "/{}_{}_{}.pkl".format('hnet', args.dataset, epoch))
        file.close()
def main():
    parser = argparse.ArgumentParser(description='reaction rates regression')
    parser.add_argument('-p', '--process', type=str,
                        choices=['DR', 'VT', 'VV', 'VV2', 'ZR'],
                        default='DR,VT,VV,VV2,ZR',
                        help='Comma-separated names of properties whose regression is performed')
    parser.add_argument('-a', '--algorithm', type=str,
                        choices=['DT', 'RF', 'ET', 'GP', 'KN', 'SVM', 'KR', 'GB', 'HGB', 'MLP'],
                        default='DT',
                        help='regression algorithm')
    args = parser.parse_args()

    process = args.process.split(',')
    directory = process[0] + '/data/processes'
    path = directory + "/*.csv"
    print("Process: ", colored(process[0], 'green'))
    algorithm = args.algorithm.split(',')
    print("Algorithm: ", colored(algorithm[0], 'blue'))
    parent_dir = "."
    print("PWD: ", colored(parent_dir, 'yellow'))

    n_jobs = 2

    for f in glob.glob(path):
        #print("{bcolors.OKGREEN}f{bcolors.ENDC}")
        print(colored(f, 'red'))

        dataset_k = pd.read_csv(f, delimiter=",").to_numpy()
        dataset_T = pd.read_csv(parent_dir + "/" + process[0] +
                                "/data/Temperatures.csv").to_numpy()

        x = dataset_T.reshape(-1, 1)
        y = dataset_k

        print("### Phase 1: PRE_PROCESSING ###")
        ########################################
        '''
        https://stackoverflow.com/questions/50565937/how-to-normalize-the-train-and-test-data-using-minmaxscaler-sklearn
        https://towardsdatascience.com/6-amateur-mistakes-ive-made-working-with-train-test-splits-916fabb421bb
        https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/
        https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02

        Fit the MinMaxScaler on the training data only, then apply the fitted scaler to the
        test data before prediction. In summary:

        Step 1: fit the scaler on the TRAINING data
        Step 2: use the scaler to transform the TRAINING data
        Step 3: use the transformed training data to fit the predictive model
        Step 4: use the scaler to transform the TEST data
        Step 5: predict using the trained model (step 3) and the transformed TEST data (step 4)

        data = datasets.load_iris()
        X = data.data
        y = data.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        model = SVC()
        model.fit(X_train_scaled, y_train)

        X_test_scaled = scaler.transform(X_test)
        y_pred = model.predict(X_test_scaled)
        '''

        data, dir, proc, model, scaler, figure, outfile = utils.mk_tree(
            f, parent_dir, process[0], algorithm[0])

        # Train/test split dataset
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75,
                                                            test_size=0.25,
                                                            random_state=69)

        # Define scalers: they can be modified to investigate the effect of scalers
        ##############################################################################
        input_scaler = None   # MinMaxScaler(feature_range=(-1, 1))
        output_scaler = None  # StandardScaler()
        ##############################################################################

        # Scale None and/or inputs and/or outputs
        x_train, x_test, y_train, y_test = utils.scale_dataset(
            x_train, x_test, y_train, y_test, input_scaler, output_scaler)

        print('Training Features Shape:', x_train.shape)
        print('Training Labels Shape:', y_train.shape)
        print('Testing Features Shape:', x_test.shape)
        print('Testing Labels Shape:', y_test.shape)

        # Save scalers (they may be useful)
        dump(input_scaler, open(scaler + "/scaler_x_MO_" + data + '.pkl', 'wb'))
        dump(output_scaler, open(scaler + "/scaler_y_MO_" + data + '.pkl', 'wb'))

        # Estimator selection (the original listed 'GB' twice; the duplicate branch was unreachable)
        if algorithm[0] == 'DT':
            est, hyper_params = estimators.est_DT()
        elif algorithm[0] == 'ET':
            est, hyper_params = estimators.est_ET()
        elif algorithm[0] == 'SVM':
            est, hyper_params = estimators.est_SVM()
        elif algorithm[0] == 'KR':
            est, hyper_params = estimators.est_KR()
        elif algorithm[0] == 'KN':
            est, hyper_params = estimators.est_KN()
        elif algorithm[0] == 'MLP':
            est, hyper_params = estimators.est_MLP()
        elif algorithm[0] == 'GB':
            est, hyper_params = estimators.est_GB()
        elif algorithm[0] == 'HGB':
            est, hyper_params = estimators.est_HGB()
        elif algorithm[0] == 'RF':
            est, hyper_params = estimators.est_RF()
        else:
            print("Algorithm not implemented ...")

        # https://github.com/ray-project/tune-sklearn
        # https://docs.ray.io/en/latest/tune/api_docs/sklearn.html#tune-sklearn-docs
        # class ray.tune.sklearn.TuneGridSearchCV(estimator, param_grid, early_stopping=None, scoring=None,
        #     n_jobs=None, cv=5, refit=True, verbose=0, error_score='raise', return_train_score=False,
        #     local_dir='~/ray_results', max_iters=1, use_gpu=False, loggers=None,
        #     pipeline_auto_early_stop=True, stopper=None, time_budget_s=None, sk_n_jobs=None)
        #scheduler = MedianStoppingRule(grace_period=10.0)
        #gs = TuneGridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs,
        #                      scoring='r2', refit=True, error_score=np.nan, return_train_score=True)
        #tune_search = TuneSearchCV(clf, parameter_grid, search_optimization="hyperopt",
        #                           n_trials=3, early_stopping=scheduler, max_iters=10)
        #tune_search.fit(x_train, y_train)

        # Exhaustive search over specified parameter values for the estimator
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        gs = GridSearchCV(est, cv=5, param_grid=hyper_params, verbose=2, n_jobs=n_jobs,
                          scoring='r2', refit=True, pre_dispatch='n_jobs',
                          error_score=np.nan, return_train_score=True)

        # Randomized search on hyper-parameters
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
        # class sklearn.model_selection.RandomizedSearchCV(estimator, param_distributions, *, n_iter=10,
        #     scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
        #     random_state=None, error_score=nan, return_train_score=False)
        #gs = RandomizedSearchCV(est, cv=10, n_iter=10, param_distributions=hyper_params, verbose=2,
        #                        n_jobs=n_jobs, scoring='r2', refit=True, pre_dispatch='n_jobs',
        #                        error_score=np.nan, return_train_score=True)

        # Training
        utils.fit(x_train, y_train, gs, outfile)

        results = pd.DataFrame(gs.cv_results_)
        # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
        #compression_opts = dict(method='zip', archive_name='GridSearchCV_results.csv')
        #results.to_csv('GridSearchCV_results.zip', index=False, compression=compression_opts)
        results.to_csv(model + "/../" + "GridSearchCV_results.csv", index=False,
                       sep='\t', encoding='utf-8')

        #plt.figure(figsize=(12, 4))
        #for score in ['mean_test_recall', 'mean_test_precision', 'mean_test_min_both']:
        #    plt.plot([_[1] for _ in results['param_class_weight']], results[score], label=score)
        #plt.legend()
        #plt.figure(figsize=(12, 4))
        #for score in ['mean_train_recall', 'mean_train_precision', 'mean_test_min_both']:
        #    plt.scatter(x=[_[1] for _ in results['param_class_weight']],
        #                y=results[score.replace('test', 'train')], label=score)
        #plt.legend()

        # Summarize results
        print("Best: %f using %s" % (gs.best_score_, gs.best_params_))
        means = gs.cv_results_['mean_test_score']
        stds = gs.cv_results_['std_test_score']
        params = gs.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))

        # Perform prediction
        y_regr = utils.predict(x_test, gs, outfile)

        # Compute the scores
        utils.scores(input_scaler, output_scaler, x_train, y_train, x_test, y_test,
                     model, gs, outfile)

        # Transform back
        x_train, x_test, y_train, y_test, y_regr = utils.scale_back_dataset(
            x_train, x_test, y_train, y_test, y_regr, input_scaler, output_scaler)

        # Make figures
        utils.draw_plot(x_test, y_test, y_regr, figure, data)

        # Save the model to disk
        dump(gs, model + "/model_MO_" + data + '.sav')
def main():
    parser = argparse.ArgumentParser(description='relaxation terms regression')
    # parser.add_argument('-p', '--process', type=str,
    #                     choices=["shear", "bulk", "conductivity", "thermal_diffusion", "mass_diffusion"],
    #                     default="shear,bulk,conductivity,thermal_diffusion,mass_diffusion",
    #                     help='Comma-separated names of transport properties whose regression is performed')
    parser.add_argument('-a', '--algorithm', type=str,
                        choices=['DT', 'RF', 'ET', 'GP', 'KN', 'SVM', 'KR', 'GB', 'HGB', 'MLP'],
                        default='DT',
                        help='regression algorithm')
    args = parser.parse_args()

    # process = args.process.split(',')
    # print("Process: ", colored(process[0], 'green'))
    algorithm = args.algorithm.split(',')
    print("Algorithm: ", colored(algorithm[0], 'blue'))

    src_dir = "."
    print("SRC: ", colored(src_dir, 'yellow'))
    output_dir = src_dir + "/.."
    print("OUTPUT: ", colored(output_dir, 'red'))

    n_jobs = 2

    # Import database
    dataset = np.loadtxt("../data/transposed_reshaped_data.txt")
    # with open('../data/TCs_air5.txt') as f:
    #     lines = (line for line in f if not line.startswith('#'))
    #     dataset = np.loadtxt(lines, skiprows=1)
    print(dataset.shape)

    # if (process[0] == "shear"):
    #     x = dataset[:, 0:7]   # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #     y = dataset[:, 7:8]   # shear viscosity
    # elif (process[0] == "bulk"):
    #     x = dataset[:, 0:7]   # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #     y = dataset[:, 8:9]   # bulk viscosity
    # elif (process[0] == "conductivity"):
    #     x = dataset[:, 0:7]   # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #     y = dataset[:, 9:10]  # thermal conductivity
    # elif (process[0] == "thermal_diffusion"):
    #     x = dataset[:, 0:7]   # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #     y = dataset[:, 10:]   # thermal diffusion, D_Ti
    # elif (process[0] == "mass_diffusion"):
    #     x = dataset[:, 0:7]   # T, P, x_N2, x_O2, x_NO, x_N, x_O
    #     y = dataset[:, :]     # mass diffusion TODO

    x = dataset[:, 0:50]  # ni_n[47], na_n[1], V, T
    y = dataset[:, 50:]   # RD_mol[47], RD_at[1]
    print(x.shape)
    print(y.shape)

    print("### Phase 1: PRE_PROCESSING ###")
    ########################################

    # 1.0) create directory tree
    model, scaler, figure = utils.mk_tree(algorithm[0], output_dir)

    # 1.1) train/test split dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75,
                                                        test_size=0.25, random_state=69)

    # 1.2) scale data and save scalers
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    sc_x.fit(x_train)
    x_train = sc_x.transform(x_train)
    x_test = sc_x.transform(x_test)
    sc_y.fit(y_train)
    y_train = sc_y.transform(y_train)
    y_test = sc_y.transform(y_test)

    print('Training Features Shape:', x_train.shape)
    print('Training Labels Shape:', y_train.shape)
    print('Testing Features Shape:', x_test.shape)
    print('Testing Labels Shape:', y_test.shape)

    dump(sc_x, open(scaler + "/scaler_x.pkl", 'wb'))
    dump(sc_y, open(scaler + "/scaler_y.pkl", 'wb'))

    print("### Phase 2: PROCESSING ###")
    ####################################

    # 2.0) estimator selection (the original listed 'GB' twice; the duplicate branch was unreachable)
    if algorithm[0] == 'DT':
        est, hyper_params = estimators.est_DT()
    elif algorithm[0] == 'ET':
        est, hyper_params = estimators.est_ET()
    elif algorithm[0] == 'SVM':
        est, hyper_params = estimators.est_SVM()
    elif algorithm[0] == 'KR':
        est, hyper_params = estimators.est_KR()
    elif algorithm[0] == 'KN':
        est, hyper_params = estimators.est_KN()
    elif algorithm[0] == 'MLP':
        est, hyper_params = estimators.est_MLP()
    elif algorithm[0] == 'GB':
        est, hyper_params = estimators.est_GB()
    elif algorithm[0] == 'HGB':
        est, hyper_params = estimators.est_HGB()
    elif algorithm[0] == 'RF':
        est, hyper_params = estimators.est_RF()
    else:
        print("Algorithm not implemented ...")

    # 2.1) search for the best hyper-parameter combination
    # Exhaustive search over specified parameter values for the estimator
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs,
                      scoring='r2', refit=True, pre_dispatch='n_jobs',
                      error_score=np.nan, return_train_score=True)

    # Randomized search on hyper-parameters
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    # class sklearn.model_selection.RandomizedSearchCV(estimator, param_distributions, *, n_iter=10,
    #     scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
    #     random_state=None, error_score=nan, return_train_score=False)
    #gs = RandomizedSearchCV(est, cv=10, n_iter=10, param_distributions=hyper_params, verbose=2,
    #                        n_jobs=n_jobs, scoring='r2', refit=True, pre_dispatch='n_jobs',
    #                        error_score=np.nan, return_train_score=True)

    # 2.2) training
    utils.fit(x_train, y_train, gs)

    # 2.3) prediction
    y_regr = utils.predict(x_test, gs)

    print("### Phase 3: POST-PROCESSING ###")
    #########################################

    # 3.0) save best hyper-parameters
    results = pd.DataFrame(gs.cv_results_)
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
    #compression_opts = dict(method='zip', archive_name='GridSearchCV_results.csv')
    #results.to_csv('GridSearchCV_results.zip', index=False, compression=compression_opts)
    results.to_csv(model + "/../" + "GridSearchCV_results.csv", index=False,
                   sep='\t', encoding='utf-8')

    # results print screen
    print("Best: %f using %s" % (gs.best_score_, gs.best_params_))
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    params = gs.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # 3.1) compute score metrics
    utils.scores(sc_x, sc_y, x_train, y_train, x_test, y_test, model, gs)

    # 3.2) back to original values (unscaling)
    x_test_dim = sc_x.inverse_transform(x_test)
    y_test_dim = sc_y.inverse_transform(y_test)
    y_regr_dim = sc_y.inverse_transform(y_regr)

    # 3.3) make plots
    utils.draw_plot(x_test_dim, y_test_dim, y_regr_dim, figure)

    # 3.4) save model to disk
    dump(gs, model + "/model.sav")
print(df_raw.isnull().sum().sort_index() / len(df_raw))

''' Fill missing values with the mean, and use codes to represent categories. '''
df, y = utils.process(df_raw, 'SalePrice')
print(df.head(1))

''' Training on all of the data would lead to overfitting. '''
# m = RandomForestRegressor(n_jobs=-1)
# m.fit(df, y)
# # `m.score` returns the r² value (1 is good, 0 is bad)
# print(m.score(df, y))

''' Split the data into a training set and a validation set. '''
validate_size = 12000  # kaggle test set size
train_size = len(df) - validate_size
X_train, X_valid = utils.split(df, train_size)
y_train, y_valid = utils.split(y, train_size)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

''' Training on a sample saves a lot of time. '''
df, y = utils.process(df_raw, 'SalePrice', sample_size=30000)

''' Keep the validation set unchanged. '''
X_train, _ = utils.split(df, 20000)
y_train, _ = utils.split(y, 20000)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print(utils.scores(m, X_train, y_train, X_valid, y_valid))
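# utils.scores is called just above with a fitted model plus train/validation splits. Its body
# is not part of this excerpt; a minimal sketch, assuming a fast.ai-course-style helper that
# reports RMSE and R² on both splits (the function layout and returned list order are assumptions).
import numpy as np

def scores(m, X_train, y_train, X_valid, y_valid):
    def rmse(pred, true):
        return np.sqrt(((pred - true) ** 2).mean())
    # [train RMSE, valid RMSE, train R², valid R²]
    res = [rmse(m.predict(X_train), y_train),
           rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train),
           m.score(X_valid, y_valid)]
    # Random forests additionally expose an out-of-bag score when oob_score=True
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    return res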