GDSC_drug_id, GDSC_drug_response_df, normalized_data_df[source_data_key]) y_values = y.values.astype(np.float32) X_values = X.values.astype(np.float32) ## Cross-validation # Splitting data split_folds = KFold(n_splits=5, shuffle=True, random_state=random_state) split_folds.get_n_splits() # Import parameters param_dict = load(open('./params/sakellaropoulos_param_grid.pkl', 'rb')) parameter_grid = ParameterGrid(param_dict) if GDSC_drug_name not in os.listdir(output_folder): os.mkdir(output_folder + GDSC_drug_name) output_folder += GDSC_drug_name + '/' # Cross-validation for each parameter in the grid for param_idx, param in enumerate(parameter_grid): print('START PARAM NUMBER %s'%(param_idx)) param['n_input'] = X_values.shape[1] network_folder = make_figure_folder(output_folder, param) y_predict_nn = np.zeros(y_values.shape[0]) for split_idx, (train_index, valid_index) in enumerate(split_folds.split(X_values, y_values)):
def GridSearch(self, batch_size_, num_epochs, random_state, gmout="GridSearchResult"): """ Run GridSearch to find best parameters. """ # train_keys, test_keys = MDCTrainTestSplit(self.target, 0) # train_keys, test_keys = DISCTrainTestSplit(self.target) train_keys, test_keys = TrainTestSplit(list(self.target.keys()), test_size=0.20, random_state=random_state) print("Train set size: %d Test set size %d" % (len(train_keys), len(test_keys))) # train_steps_per_epoch = ceil(len(train_keys)/loat(batch_size_)) # train_generator = self.DataGenerator(train_keys, batch_size_) x_train, y_train = self.GenData(train_keys) # This is unstable # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_)) # test_generator = self.DataGenerator(test_keys, batch_size_) # This is more stable x_test, y_test = self.GenData(test_keys) # PARAMETER DEFINITIONS # simple architecture param = {} param["nunits"] = [200, 400, 800] param["ndense_layers"] = [2, 4, 6] # param["dropout"] = ["on", "off"] param["dropout"] = ["on"] # param["activation"] = ["relu", "leakyrelu"] param["activation"] = ["relu"] """ # resnet architecture param = {} param["nunits"] = [200, 400, 600, 800] param["ndense_layers"] = [2, 4, 6] """ all_combo = list(ParameterGrid(param)) print("Evaluating %d combinations of parameters" % (len(all_combo))) already_computed_combo = [] if Path(gmout).is_file(): fi = open(gmout, "r") for line in fi: v = str.split(line.strip(), " ") # simple architecture units = v[0] layers = v[1] act = v[2] drop = v[3] s = ("%s-%s-%s-%s" % (units, layers, act, drop)) """ # resnet architecture units = v[0] layers = v[1] s = ("%s-%s" % (units, layers)) """ already_computed_combo.append(s) fi.close() for c in all_combo: # simple architecture s = ("%s-%s-%s-%s" % (c["nunits"], c["ndense_layers"], c["activation"], c["dropout"])) """ # resnet architecture s = ("%s-%s" % (c["nunits"], c["ndense_layers"])) """ if s in already_computed_combo: print("%s already computed... skip..." 
% (s)) else: model = build_gridsearch_model(self.nfeatures, c["ndense_layers"], c["nunits"], c["activation"], c["dropout"]) """ model = build_dnn_resnet_model(self.nfeatures, c["nunits"], c["ndense_layers"]) """ print(model.summary()) b = batch_size_ """ model_name = ("#b%d_#e%d_#u%d_#dl%d_act-%s_dp-%s" % (b, num_epochs, c["nunits"], c["ndense_layers"], c["activation"], c["dropout"])) """ model_name = ("#b%d_#e%d_#u%d_#dl%d" % (b, num_epochs, c["nunits"], c["ndense_layers"])) log_dir_ = ("./logs/%s" % (model_name)) log_dir_ += time.strftime("%Y%m%d%H%M%S") model_output = "%s.h5" % (model_name) callbacks_ = [ TensorBoard(log_dir=log_dir_, histogram_freq=0, write_graph=False, write_images=False), ModelCheckpoint(model_output, monitor='val_loss', verbose=0, save_best_only=True) ] """ callbacks_ = [TensorBoard(log_dir=log_dir_, histogram_freq=0, write_graph=False, write_images=False), EarlyStopping(monitor='val_loss', min_delta=0, patience=50, verbose=0, mode='auto')] """ model.fit(x_train, y_train, epochs=num_epochs, batch_size=b, verbose=self.verbose, validation_data=(x_test, y_test), callbacks=callbacks_) bestmodel = GetLoadModelFnc()(model_output) yrecalc_train = bestmodel.predict(x_train) """ model.fit_generator(train_generator, steps_per_epoch=train_steps_per_epoch, epochs=num_epochs, verbose=1, validation_data=(x_test, y_test), # validation_data=test_generator, # validation_steps=test_steps_per_epoch, callbacks=callbacks_) yrecalc_train = [] y_train = [] for key in train_keys: a = np.array([self.X_raw[key]]) yrecalc_train.extend(model.predict(a)) y_train.append(self.target[key]) """ ypred_test = bestmodel.predict(x_test) r2 = RSQ(y_train, yrecalc_train) mse_train = MSE(y_train, yrecalc_train) mae_train = MAE(y_train, yrecalc_train) q2 = RSQ(y_test, ypred_test) mse_test = MSE(y_test, ypred_test) mae_test = MAE(y_test, ypred_test) print("R2: %.4f Q2: %.4f" % (r2, q2)) fo = open("%s" % (gmout), "a") """ # simple architecture fo.write("%d %d %s %s %f %f %f %f %f %f %f %f\n" % (c["nunits"], c["ndense_layers"], c["activation"], c["dropout"], mse_train, mae_train, r2, train_score, mse_test, mae_test, q2, test_score)) """ # resnet architecture fo.write("%d %d %f %f %f %f %f %f\n" % (c["nunits"], c["ndense_layers"], mse_train, mae_train, r2, mse_test, mae_test, q2)) fo.close()
# Necessary to add cwd to path when script run
# by SLURM (since it executes a copy)
sys.path.append(os.getcwd())

df = pd.read_csv('notes.csv')       # 2,083,180 notes
df = df[~(df['iserror'] == 1)]      # 886 notes with errors

termsToRemove = ['Admission Date:', 'Discharge Date:', 'Service:', 'ADDENDUM:',
                 'Dictated By:', 'Completed by:', 'D:', 'T:', 'JOB#:',
                 '\?\?\?\?\?\?', 'INTERPRETATION:', 'Findings',
                 'Attending Physician:', 'Referral date:']
texts = (df['text']
         .str.replace(r'(%s)' % ('|'.join(termsToRemove)), '')
         .str.replace(r'\[\*\*.*?\*\*\]', ''))
toks = texts.str.lower().apply(nltk.word_tokenize).values
print(sum(map(len, toks)), 'tokens')

# param_grid = {'size': [100, 200, 300],
#               'min_count': [5, 10, 50],
#               'window': [1, 5, 20]}
param_grid = {'size': [200, 300],
              'min_count': [2, 3, 4],
              'window': [20]}

for i in ParameterGrid(param_grid):  # skip-grams
    model = gensim.models.Word2Vec(toks, sg=1, workers=80, seed=42, **i)
    pickle.dump(model, open('embeddings/%sd_%sc_%sw'
                            % (i['size'], i['min_count'], i['window']), 'wb'))
# }, # { # "h_sizes": [80, 80, 80], # "dense_size": 40, # "pos_emb_size": 50, # "cnn_win_size": 7, # "suffix_prefix_dims": 70, # "suffix_prefix_buckets": 3000, # "target_emb_dim": 25, # "mention_emb_dim": 25, # } ], "learning_rate": [0.0001, 0.001, 0.00001], "learning_rate_decay": [0.998] # 0.991 } def flatten(params): new_params = {} for key, val in params.items(): if type(val) is dict: new_params.update(val) else: new_params[key] = val return new_params cnn_params = list(map(flatten, ParameterGrid(cnn_params_grids))) att_params = list(map(flatten, ParameterGrid(att_params_grids)))
def _get_param_iterator(self):
    return ParameterGrid(self.param_grid)
from utils import df2mapk

EPOCH_NUM = 1
PARAM_SEARCHED = [
    {
        'a': [0.01, 0.03, 0.1],
        'b': [1.0],  # based on convention
        'l1': [8.0, 10.0, 13.0],
        'l2': [1.0],
        'n': [2**29],
        'interaction': [True],
        'epoch': [1],
    },
]
FTRL_PARAM = list(ParameterGrid(PARAM_SEARCHED))
# {'a': 0.1, 'b': 1., 'l1': 1., 'l2': 0.001, 'n': 2**24, 'epoch': EPOCH_NUM, 'interaction': False}


def main():
    data_fname_base = sys.argv[1]
    subtrain_fname = '../subtrain_' + data_fname_base
    subtrain_id_fname = subtrain_fname + '.id'
    validation_fname = '../validation_' + data_fname_base
    validation_id_fname = validation_fname + '.id'
    test_fname = '../test_' + data_fname_base
    test_id_fname = test_fname + '.id'

    print('Load subtrain data...')
    x_subtrain, y_subtrain = load_svmlight_file(subtrain_fname)
    print('Load validation data...')
    # (inside the per-image preprocessing loop that precedes this snippet)
    iImage = iImage.resize(Resize)
    iImage = numpy.array(iImage)
    iImage = iImage.astype("float32")
    iImage = iImage / 255
    Valid["Image"].append(iImage)

Valid["Image"] = numpy.array(Valid["Image"])
Valid["EncodeLabelCode"] = to_categorical(Valid["Label"])
Valid["Variable"] = Valid["Table"][Variable]
################################################################################
## Parameter control
# Note: the duplicated Batch and LearnRate values make ParameterGrid emit
# 3 * 2 = 6 identical configurations (presumably intended as repeat runs).
Parameter = {}
Parameter["Batch"] = [16, 16, 16]
Parameter["Epoch"] = [50]
Parameter["LearnRate"] = [0.0001, 0.0001]
Parameter["Optimizer"] = ["Adam"]
Parameter = list(ParameterGrid(Parameter))
##
## Tune result
TuneResult = {}
TuneResult["Batch"] = []
TuneResult["Epoch"] = []
TuneResult["LearnRate"] = []
TuneResult["TrainAUC"] = []
TuneResult["TrainAccuracy"] = []
TuneResult["ValidAUC"] = []
TuneResult["ValidAccuracy"] = []
##
## The better model
TheBetterModel = "Empty"
print("Prepare!")
################################################################################
def optimize_coefficients(num_coeff=3, loss_func=None, phi=1.0, max_cost=2.0, search_per_coeff=4, sort_by_loss=False, save_coeff=True, tol=None, verbose=True): """ Computes the possible values of any number of coefficients, given a cost function, phi and max cost permissible. Takes into account the search space per coefficient so that the subsequent grid search does not become prohibitively large. # Arguments: num_coeff: number of coefficients that must be optimized. cost_func: coefficient cost function that minimised to satisfy the least squares solution. The function can be user defined, in which case it must accept a numpy vector of length `num_coeff` defined above. It is suggested to use MSE against a pre-refined `max_cost`. phi: The base power of the parameters. Kept as 1 for initial search of base parameters. max_cost: The maximum cost of permissible. User defined constant generally set to 2. search_per_coeff: int declaring the number of values tried per coefficient. Constructs a search space of size `search_per_coeff` ^ `num_coeff`. sort_by_loss: bool. Whether to sort the result set by its loss value, in order of lowest loss first. save_coeff: bool, whether to save the resulting coefficients into the file `param_coeff.npy` in current working dir. tol: float tolerance of error in the cost function. Used to select candidates which have a cost less than the tolerance. verbose: bool, whether to print messages during execution. # Returns: A numpy array of shape [search_per_coeff ^ num_coeff, num_coeff], each row defining the value of the coefficients which minimise the cost function satisfactorily (to some machine precision). """ phi = float(phi) max_cost = float(max_cost) search_per_coeff = int(search_per_coeff) # if user defined cost function is not provided, use the one from # the paper in reference. if loss_func is None: loss_func = get_compound_coeff_func(phi, max_cost) # prepare inequality constraints ineq_constraints = {'type': 'ineq', 'fun': lambda x: x - 1.} # Prepare a matrix to store results num_samples = search_per_coeff**num_coeff param_range = [num_samples, num_coeff] # sorted by ParameterGrid acc to its key value, assuring sorted # behaviour for Python < 3.7. grid = { i: np.linspace(1.0, max_cost, num=search_per_coeff) for i in range(num_coeff) } if verbose: print("Preparing parameter grid...") print("Number of parameter combinations :", num_samples) param_grid = ParameterGrid(grid) if _joblib_available: with Parallel(n_jobs=-1, verbose=10 if verbose else 0) as parallel: param_set = parallel( delayed(_joblib_optimize)(param, loss_func, num_coeff, ineq_constraints) for param in param_grid) param_set = np.asarray(param_set) else: if verbose and num_samples > 1000: print("Consider using `joblib` library to speed up sequential " "computation of {} combinations of parameters".format( num_samples)) param_set = np.zeros(param_range) param_set = _sequential_optimize(param_grid, param_set, loss_func, num_coeff=num_coeff, ineq_constraints=ineq_constraints, verbose=verbose) # compute a minimum tolerance of the cost function # to select it in the candidate list. 
if tol is not None: if verbose: print("Filtering out samples below tolerance threshold...") tol = float(tol) cost_scores = np.asarray([loss_func(xi) for xi in param_set]) param_set = param_set[np.where(cost_scores <= tol)] else: cost_scores = None # sort by lowest loss first if sort_by_loss: if verbose: print("Sorting by loss...") if cost_scores is None: cost_scores = ([loss_func(xi) for xi in param_set]) else: cost_scores = cost_scores.tolist() cost_scores_id = [(idx, loss) for idx, loss in enumerate(cost_scores)] cost_scores_id = sorted(cost_scores_id, key=lambda x: x[1]) ids = np.asarray([idx for idx, loss in cost_scores_id]) # reorder the original param set param_set = param_set[ids, ...] if save_coeff: np.save('param_coeff.npy', param_set) return param_set
def optimize_coefficients(num_coeff=3, cost_func=None, phi=1.0, max_cost=2.0,
                          search_per_coeff=4, save_coeff=True, tol=None):
    """
    Computes the possible values of any number of coefficients,
    given a cost function, phi and the maximum permissible cost.

    Takes into account the search space per coefficient so that the
    subsequent grid search does not become prohibitively large.

    # Arguments:
        num_coeff: number of coefficients that must be optimized.
        cost_func: coefficient cost function that is minimised to satisfy
            the least-squares solution. The function can be user defined,
            in which case it must accept a numpy vector of length
            `num_coeff` defined above. It is suggested to use MSE against
            a pre-defined `max_cost`.
        phi: the base power of the parameters. Kept as 1 for the initial
            search of the base parameters.
        max_cost: the maximum permissible cost; a user-defined constant,
            generally set to 2.
        search_per_coeff: int declaring the number of values tried per
            coefficient. Constructs a search space of size
            `search_per_coeff` ** `num_coeff`.
        save_coeff: bool, whether to save the resulting coefficients into
            the file `param_coeff.npy` in the current working dir.
        tol: float tolerance of error in the cost function. Used to select
            candidates which have a cost less than the tolerance.

    # Returns:
        A numpy array of shape [search_per_coeff ** num_coeff, num_coeff],
        each row defining the value of the coefficients which minimise the
        cost function satisfactorily (to some machine precision).
    """
    phi = float(phi)
    max_cost = float(max_cost)
    search_per_coeff = int(search_per_coeff)

    # If a user-defined cost function is not provided, use the one from
    # the paper in the reference.
    if cost_func is None:
        cost_func = get_compound_coeff_func(phi, max_cost)

    # Prepare inequality constraints (each coefficient must be >= 1).
    ineq_constraints = {
        'type': 'ineq',
        'fun': lambda x: x - 1.
    }

    # Prepare a matrix to store results.
    param_range = [search_per_coeff ** num_coeff, num_coeff]
    param_set = np.zeros(param_range)

    # ParameterGrid sorts by key value, assuring sorted behaviour
    # for Python < 3.7.
    grid = {i: np.linspace(1.0, max_cost, num=search_per_coeff)
            for i in range(num_coeff)}
    param_grid = ParameterGrid(grid)

    for ix, param in enumerate(param_grid):
        # Create a vector for the cost function and minimise using SLSQP.
        x0 = np.array([param[i] for i in range(num_coeff)])
        res = minimize(cost_func, x0, method='SLSQP',
                       constraints=ineq_constraints)
        param_set[ix] = res.x

    # Compute a minimum tolerance of the cost function
    # to select it in the candidate list.
    if tol is not None:
        tol = float(tol)
        cost_scores = np.array([cost_func(xi) for xi in param_set])
        param_set = param_set[np.where(cost_scores <= tol)]

    if save_coeff:
        np.save('param_coeff.npy', param_set)

    return param_set
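# A hypothetical call, based only on the signature and docstring above;
# the tolerance value is illustrative.
candidates = optimize_coefficients(num_coeff=3, phi=1.0, max_cost=2.0,
                                   search_per_coeff=4, tol=1e-10)
# Without the tol filter the result has shape
# (search_per_coeff ** num_coeff, num_coeff) = (64, 3); the filter keeps only
# rows whose cost is at most tol.
print(candidates.shape)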
    show_decision_boundary(rp.clf,
                           'Decision boundary for LogisticRegression (+rankpruning) trained with '
                           'noisy labels.\n Test Accuracy: ' + str(round(rp_score, 3)))
except:
    print("Plotting is only supported in an iPython interface.")


# In[5]:

param_grid = {
    "prune_method": ["prune_by_noise_rate", "prune_by_class", "both"],
    "converge_latent_estimates": [True, False],
}

# Fit LearningWithNoisyLabels across all parameter settings.
from sklearn.model_selection import ParameterGrid
params = ParameterGrid(param_grid)
scores = []
for param in params:
    clf = LogisticRegression(solver='lbfgs', multi_class='auto')
    rp = LearningWithNoisyLabels(clf=clf, **param)
    _ = rp.fit(X_train, s)  # s is the noisy y_train labels
    scores.append(accuracy_score(rp.predict(X_test), y_test))

# Print results sorted from best to least
for i in np.argsort(scores)[::-1]:
    print("Param settings:", params[i])
    print("Accuracy (using confident learning):\t", round(scores[i], 2), "\n")
def tune(x_train, y_train_orig, cols): logger.info('{}'.format(cols)) x_train = x_train[cols].values # [:, FEATURE] if IS_LOG: y_train = np.log1p(y_train_orig) else: y_train = y_train_orig logger.info('x_shape: {}'.format(x_train.shape)) # x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242) all_params = { 'eta': [0.05], 'max_depth': [5], 'subsample': [0.7], 'colsample_bytree': [0.7], 'objective': ['reg:linear'], #'eval_metric': [['rmse', rmsel_metric]], 'silent': [1] } min_score = (100, 100, 100) min_params = None use_score = 0 cv = np.arange(x_train.shape[0]) for params in list(ParameterGrid(all_params)): #cv = TimeSeriesSplit(n_splits=5).split(x_train) cnt = 0 list_score = [] list_score2 = [] list_best_iter = [] all_pred = np.zeros(y_train.shape[0]) for train, test in [[cv[:-VALID_NUM], cv[-VALID_NUM:]]]: trn_x = x_train[train] val_x = x_train[test] trn_y = y_train[train] val_y = y_train[test] dtrain = xgb.DMatrix(trn_x, trn_y) dtest = xgb.DMatrix(val_x, val_y) clf = xgb.train( params, dtrain, feval=rmsel_metric, evals=[(dtest, 'val')], num_boost_round=1000, # 384, early_stopping_rounds=100) pred = clf.predict(dtest) all_pred[test] = pred _score = rmsel(val_y, pred) _score2 = rmse( val_y, pred) # np.exp(pred) - 1) # - roc_auc_score(val_y, pred) # logger.debug(' _score: %s' % _score) list_score.append(_score) list_score2.append(_score2) if clf.best_iteration != -1: list_best_iter.append(clf.best_iteration) else: list_best_iter.append(params['n_estimators']) # with open('tfidf_all_pred2_7.pkl', 'wb') as f: # pickle.dump(all_pred, f, -1) logger.info('trees: {}'.format(list_best_iter)) params['n_estimators'] = np.mean(list_best_iter, dtype=int) score = (np.mean(list_score), np.min(list_score), np.max(list_score)) score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2)) logger.info('param: %s' % (params)) logger.info('loss: {} (avg min max {})'.format(score[use_score], score)) logger.info('score: {} (avg min max {})'.format( score2[use_score], score2)) if min_score[use_score] > score[use_score]: min_score = score min_score2 = score2 min_params = params logger.info('best score: {} {}'.format(min_score[use_score], min_score)) logger.info('best score2: {} {}'.format(min_score2[use_score], min_score2)) logger.info('best_param: {}'.format(min_params)) gc.collect() return min_score[use_score]
'DROPOUT': [0.2], 'ACTIVATION2': ['relu'], 'OPTIMIZER': ['Adam'] } counter = 0 grid_results = pd.DataFrame(columns=[ 'counter', 'col_selection', 'HISTORY', 'STEP_FACTOR', 'LEVEL', 'WAVELET', 'FILTERS1', 'KERNEL_SIZE1', 'PADDING1', 'DILATION_RATE1', 'ACTIVATION1', 'CONV2', 'POOL_SIZE1', 'CONV_POOL2', 'DENSE_UNITS', 'DROPOUT', 'ACTIVATION2', 'OPTIMIZER', 'mse', 'mae', 'train_samples', 'test_samples', 'train_time' ]) output_path = results_path + 'gridsearch_results.csv' param_grid = ParameterGrid(parameters) for dict_ in param_grid: start = time() conv_mse, mae, train_shape, test_shape, predictions = grid_search( df, dict_['col_selection'], dict_['HISTORY'], dict_['STEP_FACTOR'], dict_['LEVEL'], dict_['WAVELET'], dict_['FILTERS1'], dict_['KERNEL_SIZE1'], dict_['PADDING1'], dict_['DILATION_RATE1'], dict_['ACTIVATION1'], dict_['CONV2'], dict_['POOL_SIZE1'], dict_['CONV_POOL2'], dict_['DENSE_UNITS'], dict_['DROPOUT'], dict_['ACTIVATION2'], dict_['OPTIMIZER'], counter) train_time = round(time() - start, 2) grid_results = grid_results.append( { 'counter': counter, 'col_selection': dict_['col_selection'], 'HISTORY': dict_['HISTORY'],
def find_best_params_GridSearchCV(self, gridsearch, X_local, y_local): """ Find best set of parameters given a grid-search round across various folds. """ """ Generates all combination of hyperparameters """ hyperparam_space = list(ParameterGrid(gridsearch)) if len(hyperparam_space) == 1: return hyperparam_space[0] """ Splits local folds """ kf = KFold(n_splits=self.cross_validation) kf.get_n_splits(self.X_train) folds = [] for train_index, test_index in kf.split(X_local): folds.append([pd.Series(train_index), pd.Series(test_index)]) if self.verbose: n_folds = len(folds) n_combinations = len(hyperparam_space) msg = "Fitting %s models...\n" print(msg % (n_combinations)) """ Performs gridsearch """ #Stores performance, Stores classification reports performance = [] for params_it, params in enumerate(hyperparam_space): time_start = time.time() #Evaluation rounds local_results = [] for fold_it, fold in enumerate(folds): X_train = X_local.iloc[fold[0]] X_test = X_local.iloc[fold[1]] y_train = y_local.iloc[fold[0]] y_test = y_local.iloc[fold[1]] params['eval_set'] = [(X_test, y_test)] alg = xgbw.XGBWrapper(**params) alg.fit(X_train, y_train) pred_test = alg.predict(X_test) local_report = class_report(y_true=y_test, y_pred=pred_test) local_results.append(local_report) #Stores performance evaluation given the performance-policy metric, statistic = self.performance_politic.split('__') local_performance = [] for local_report in local_results: local_report = local_report.drop('Global') metric_results = local_report[metric] if statistic == 'min': metric_stat = metric_results.min() elif statistic == 'max': metric_stat = metric_results.max() elif statistic == 'mean': metric_stat = metric_results.mean() local_performance.append(metric_stat) local_performance = pd.Series(local_performance) performance.append(local_performance) time_end = time.time() elapsed_time = (time_end - time_start) if self.gridsearch_verbose: msg = "%s of %s - %s: %s - %s s" % ( (params_it + 1), n_combinations, self.performance_politic, round(local_performance.mean(), 4), round(elapsed_time, 2)) print(msg) for param_name in params.keys(): if param_name != 'eval_set': msg = "\t%s: %r" % (param_name, params[param_name]) if self.verbose: print(msg) print('') performance = pd.DataFrame(performance) mean_performance = performance.mean(axis=1) idx_best = mean_performance.idxmax() best_parameters = hyperparam_space[idx_best] return best_parameters
def run_gridsearch(self, cv, cv_score: str) -> None: """ Performs a gridsearch over the tuning hyperparameters. Determines the best hyperparameters based on the average validation performance calculated over cross-validation folds. :param cv: A cross-validarion generator that determines the cross-validation strategy. :param cv_score: Measure to evaluate predictions on the validation set. """ # Setting fixed parameters params = self.fixed_params.copy() # Fix seed np.random.seed(1) tf.set_random_seed(2) # Start Gridsearch best_AUC = 0.5 for tune in ParameterGrid(self.tuning_params): params.update(tune) AUC_val = [] for train, val in cv.split(self.X_tr, self.y_tr): X_train, y_train = self.X_tr.iloc[train], self.y_tr.iloc[train] X_val, y_val = self.X_tr.iloc[val], self.y_tr.iloc[val] e_stop = EarlyStopping( monitor=params["monitor"], min_delta=params["min_delta"], patience=params["iter_patience"], mode=params["mode"], ) callbacks = [e_stop] optimizer = eval("keras.optimizers." + params["optimizer"])( lr=params["learning_rate"] ) model = Sequential() model.add( Dense( params["num_neurons"], input_dim=len(list(self.X_tr)), kernel_initializer=params["weight_init"], activation=params["hidden_activation"], kernel_regularizer=keras.regularizers.l1(params["l1_ratio"]), ) ) model.add(Dropout(params["dropout_rate"])) model.add( Dense( 1, kernel_initializer=params["weight_init"], activation=params["out_activation"], kernel_regularizer=keras.regularizers.l1(params["l1_ratio"]), ) ) model.compile(loss=params["loss_func"], optimizer=optimizer) history = model.fit( X_train, y_train, callbacks=callbacks, validation_data=(X_val, y_val), epochs=params["epochs"], batch_size=params["batch_size"], verbose=0, ) validation_AUC = calc_perf_score( data=X_val, labels=np.array(y_val.astype("float")), model=model, model_name=self.name, score_name=cv_score, ) AUC_val.append(validation_AUC) AUC_val = np.mean(AUC_val) if AUC_val > best_AUC: best_AUC = AUC_val self.best_tuning_params = tune keras.backend.clear_session()
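# run_gridsearch above reads every hyperparameter from a single merged dict,
# so self.fixed_params must supply the keys that are not tuned. An illustrative
# split follows; only the key names come from the code above, the concrete
# values are assumptions.
fixed_params = {
    "monitor": "val_loss", "min_delta": 0.0, "iter_patience": 10, "mode": "min",
    "optimizer": "Adam", "weight_init": "glorot_uniform",
    "hidden_activation": "relu", "out_activation": "sigmoid",
    "loss_func": "binary_crossentropy", "epochs": 100, "batch_size": 32,
}
tuning_params = {
    "num_neurons": [32, 64, 128],
    "learning_rate": [1e-2, 1e-3],
    "dropout_rate": [0.0, 0.5],
    "l1_ratio": [0.0, 1e-4],
}
# ParameterGrid(tuning_params) then enumerates 3 * 2 * 2 * 2 = 24 combinations.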
"8,4,2", "8,2,2" ], "rnns": [ "2000,2000", "1000,1000", "1000" ], "mlps": [ # Last layer will automatically be added to map toward a q_level space "1000,1000", "2000,2000" ], "emb_size": [256], "q_levels": [256], } hparams = list(ParameterGrid(hparams)) for index, hparam in enumerate(hparams): print("Training config {:d}".format(index)) # Create folder param_folder = LOGDIR_ROOT + '/' + str(index) os.mkdir(param_folder) # Write config with open(param_folder + '/hparam.json', 'w') as fp: json.dump(hparam, fp) main_command = """python main.py \\ --batch_size=64 \\
data[key] = load_all_data( basedir=BASEDIR, class_one=CLASS_ONE, class_two=CLASS_TWO, ending=ENDING, ) else: logger.info("data was pre-built, continuing") data_train = data[key][job["data_prefix"] + "train"] data_test = data[key][job["data_prefix"] + "test"] labels_train = data[key]["labels_train"] labels_test = data[key]["labels_test"] t_hparam_grid = ParameterGrid(training_hparams) m_hparam_grid = ParameterGrid(model_hparams) logger.info("selecting {} combinations".format(nb_selections)) candidates = list(product(list(t_hparam_grid), list(m_hparam_grid))) hparam_grid = random.sample(population=candidates, k=nb_selections) # if nb_proc > 1: for count, (t_hparams, m_hparams) in enumerate(hparam_grid): logger.info("performing grid element {} of {}".format( count + 1, nb_selections)) train_caloclf_model( model_fn=model_fn,
# Read arguments.
if len(sys.argv) > 1:
    run_from_args()
else:
    from sklearn.model_selection import ParameterGrid

    # Get task grid.
    Edit_Cost_List = ['BIPARTITE', 'IPFP']
    Dataset_list = ['Alkane_unlabeled', 'Acyclic', 'Chiral', 'Vitamin_D',
                    'Steroid']
    Dis_List = ['euclidean', 'manhattan']
    task_grid = ParameterGrid({
        'edit_cost': Edit_Cost_List[0:1],
        'dataset': Dataset_list[1:2],
        'distance': Dis_List[:]
    })

    unlabeled = False  # @todo: Not actually used.
    mode = 'reg'

    # Run.
    for task in list(task_grid):
        print()
        print(task)
        output_result = 'outputs/results.' + '.'.join([
            task['dataset'], task['edit_cost'], task['distance']
        ]) + '.pkl'

        if not os.path.isfile(output_result):
            run_xp(task['dataset'], output_result, unlabeled, mode,
# train_test_split moved from sklearn.cross_validation to sklearn.model_selection
# (the old module was removed in scikit-learn 0.20).
from sklearn.model_selection import train_test_split

# {'colsample_bytree': 0.7, 'max_bin': 255, 'seed': 2261, 'n_estimators': 8488,
#  'min_child_samples': 10, 'learning_rate': 0.01, 'max_depth': 10,
#  'boosting_type': 'gbdt', 'reg_alpha': 1, 'reg_lambda': 1,
#  'min_child_weight': 3, 'num_leaves': 200, 'min_split_gain': 0, 'subsample': 0.9}

# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)
all_params = {
    'C': [0.1],
    'n_jobs': [-1],  # [0.06, 0.1, 0.2],
    'solver': ['sag'],
    'random_state': [2261]
}

min_score = (100, 100, 100)
min_params = None
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
use_score = 0

for params in ParameterGrid(all_params):
    list_score = []
    list_score2 = []
    list_best_iter = []
    all_pred = np.zeros(y_train.shape[0])

    for train, test in cv.split(x_train, y_train):
        trn_x = x_train[train]
        val_x = x_train[test]
        trn_y = y_train[train]
        val_y = y_train[test]
        trn_w = sample_weight[train]
        val_w = sample_weight[test]

        clf = LogisticRegression(**params)
        # pred_x = cross_val_predict(reg, trn_x, trn_y, cv=5, n_jobs=-1)
        # trn_x = np.c_[trn_x, pred_x]
def build_a_complex_brain(): from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.model_selection import ParameterGrid from sklearn.ensemble import VotingClassifier # Voting will help us win the competition! We can combine the top models together! # Models we will use: (TODO: Read the documentation and come up with better Classifiers and Parameters) from sklearn.naive_bayes import MultinomialNB from sklearn.neural_network import MLPClassifier from sklearn.naive_bayes import BernoulliNB from sklearn.tree import DecisionTreeClassifier from sklearn.tree import ExtraTreeClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import LinearSVC from sklearn.linear_model import LogisticRegression from sklearn.neighbors import NearestCentroid from sklearn.neighbors import RadiusNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import RidgeClassifier from sklearn.svm import NuSVC from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.linear_model import SGDClassifier from sklearn.ensemble import AdaBoostClassifier # Models end # All the models we are going to use candidate_models = [ AdaBoostClassifier, # SGDClassifier, MultinomialNB, BernoulliNB, DecisionTreeClassifier, ExtraTreeClassifier, ExtraTreesClassifier, KNeighborsClassifier, # LinearSVC, LogisticRegression, NearestCentroid, RandomForestClassifier, RidgeClassifier, # NuSVC, GradientBoostingClassifier, ] # The parameters that could pass to model candidate_parameters = { "SGDClassifier": {"max_iter": [10, 100], "tol" : [1e-3], "loss": ["log", "modified_huber"]}, "ExtraTreesClassifier": {"n_estimators": [20, 200]}, "LinearSVC": {"max_iter": [200, 2000], "multi_class": ["crammer_singer"]}, "LogisticRegression": {"solver": ["lbfgs"], "multi_class": ["auto"]}, "RandomForestClassifier": {"n_estimators": [10,30]}, "NuSVC": {"gamma": ["scale", "auto"]}, } # Build the model brains = [] for this_model in candidate_models: if this_model.__name__ in candidate_parameters: parameter_grid = candidate_parameters[this_model.__name__] parameters = list(ParameterGrid(parameter_grid)) # make grid for parameter-pairs for i, parameter in enumerate(parameters): brain = Pipeline([ ("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf", this_model(**parameter)), ]) brains.append( (f"{this_model.__name__}-{i}", brain) ) else: brain = Pipeline([ ("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf", this_model()), ]) brains.append( (f"{this_model.__name__}", brain) ) brain = VotingClassifier( estimators=brains, voting="hard" ) return brain
# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242) all_params = { 'eta': [0.05], 'max_depth': [5], 'subsample': [0.7], 'colsample_bytree': [0.7], 'objective': ['reg:linear'], #'eval_metric': [['rmse', rmsel_metric]], 'silent': [1] } min_score = (100, 100, 100) min_params = None use_score = 0 cv = np.arange(x_train.shape[0]) for params in tqdm(list(ParameterGrid(all_params))): #cv = TimeSeriesSplit(n_splits=5).split(x_train) cnt = 0 list_score = [] list_score2 = [] list_best_iter = [] all_pred = np.zeros(y_train.shape[0]) for train, test in [[cv[:-VALID_NUM], cv[-VALID_NUM:]]]: trn_x = x_train[train] val_x = x_train[test] trn_y = y_train[train] val_y = y_train[test] dtrain = xgb.DMatrix(trn_x, trn_y) dtest = xgb.DMatrix(val_x, val_y)
args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) # Generate a grid of hyperparameters param_grid_base = { "lr": [1e-3, 1e-4, 1e-5], "batch_size": [128, 256, 512], "drop_prob": [0.0, 0.25, 0.5, 0.75], "num_epochs": [150], "num_hidden": [1, 2, 3], "hidden_dim": [128, 256], "gamma": [1.0], "early_stopping": [True], "early_stopping_patience": [10], } the_grid = list(ParameterGrid(param_grid_base)) np.random.shuffle(the_grid) the_grid = the_grid[:args.grid_size] print(args.grid_size) for task in args.tasks: # the_grid = [{**args.__dict__, **x} for x in the_grid] grid_df = pd.DataFrame(the_grid) config_path = os.path.join(args.data_path, "experiments", args.experiment_name, "config", task) os.makedirs(config_path, exist_ok=True) grid_df.to_csv(os.path.join(config_path, "config.csv"), index_label="id") for i, config_dict in enumerate(the_grid): config_dict["label_col"] = task yaml_write(config_dict,
"draw": SP_DRAW, "print_train": SP_PRINT_TRAIN, # 0: nothing, 1 : full detail, 2: short version } paras_name = "hs_{}-ep_{}-act_{}".format(item["hidden_size"], item["epoch"], item["activation"]) root_hybrid_paras = { "hidden_size": item["hidden_size"], "activation": item["activation"], "epoch": item["epoch"], "domain_range": item["domain_range"], "paras_name": paras_name } two_paras = {"epoch": item["epoch"], "pop_size": item["pop_size"]} md = OTwoElm(root_base_paras=root_base_paras, root_hybrid_paras=root_hybrid_paras, two_paras=two_paras) md._running__() for _ in range(SP_RUN_TIMES): for loop in range(len(SP_DATA_FILENAME)): filename = SP_LOAD_DATA_FROM + SP_DATA_FILENAME[loop] dataset = load_dataset(filename, cols=SP_DATA_COLS[loop]) feature_size = len(SP_DATA_COLS[loop]) multi_output = SP_DATA_MULTI_OUTPUT[loop] output_index = SP_OUTPUT_INDEX[loop] # Create combination of params. for item in list(ParameterGrid(param_grid)): train_model(item)
listValidImage = [] for index, data in dataValid.iterrows(): strResult = str(data["result"]) strImage = str(data["id"]) + ".jpg" objectImage = pil.open("Holdout/Valid/Image/" + strResult + "/" + strImage).resize(tupleResize) arrayImage = numpy.array(objectImage) / 255 listValidImage.append(arrayImage) arrayValidImage = numpy.array(listValidImage).astype("float32") ## ## Parameter dictParameter = {} dictParameter["Batch"] = [64] dictParameter["Epoch"] = [2, 2, 2] dictParameter["Eta"] = [0.0001] dictParameter["Optimizer"] = ["Adam"] listParameter = list(ParameterGrid(dictParameter)) ## ## Result dictTuneResult = TuneResult() strResultPath = strOutputPath + str.split(str(timeit.default_timer()), ".")[1] + "/" os.makedirs(strResultPath) ## ## Model for i, p in enumerate(listParameter): ## ## Initial floatStart = timeit.default_timer() random.seed(2) numpy.random.seed(2018) tensorflow.set_random_seed(2018) os.environ['PYTHONHASHSEED'] = "1"
def create_parameter_grid(param_dict):
    from sklearn.model_selection import ParameterGrid
    return ParameterGrid(param_dict)
def _get_params(self):
    """Parameters to pass to `fit`.

    By default, a GridSearch over ``self.parameters`` is used.
    """
    return ParameterGrid(self.parameters)
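# For reference, ParameterGrid expands a dict of value lists into the cross
# product of settings. A minimal sketch, grounded in the scikit-learn docs,
# of what the iterator returned above yields:
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({'a': [1, 2], 'b': [True, False]})
assert list(grid) == [{'a': 1, 'b': True}, {'a': 1, 'b': False},
                      {'a': 2, 'b': True}, {'a': 2, 'b': False}]
assert len(grid) == 4 and grid[0] == {'a': 1, 'b': True}  # supports len() and indexing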
import argparse  # needed for the CLI below

from sklearn.model_selection import ParameterGrid
import tensorflow as tf

from models.gans.mnist import generator_forward
from models.tasks.mnist import classifier_forward
from tasks.semi_classify import run_task
from utils.data_utils import MNISTLoader

NAME_STYLE = "pub_%(public_num)d_final_%(gen_frac_final).1f_step_%(gen_frac_step)d"

num_images = 50000

GRID = ParameterGrid({
    "gen_frac_init": [0.00],
    "gen_frac_final": [0.0, 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0],
    "gen_frac_step": [0, 100, 200, 400, 500, 600, 800],
    "public_num": [100, 250, 500, 750, 1000],
})

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model_path", metavar="MODELPATH")
    parser.add_argument("--log-dir", dest="log_dir")
    parser.add_argument("--save-dir", dest="save_dir")
    parser.add_argument("--data-dir", dest="data_dir", default="/tmp/mnist")
    parser.add_argument("--batch-size", dest="batch_size", type=int, default=128)
    parser.add_argument("-e", "--num-epoch", dest="num_epoch", type=int, default=10)
    parser.add_argument("--dim", dest="dim", default=64, type=int)
    parser.add_argument("--public-num", dest="public_num", type=int, default=50000)
def basic_gridsearch(ex, grid, param_drops=None, verbose=2,
                     handle_completed_state=True, logging='WARNING'):
    """Basic gridsearch for a sacred experiment.

    Given an experiment and a parameter grid, this will iterate over all
    possible combinations of parameters specified in the grid. In an
    iteration, the experiment configuration is updated and the experiment
    is run.

    Args:
        ex (sacred.Experiment): A sacred experiment.
        grid (dict, list): Parameter grid, set up analogously to sklearn
            gridsearches. If a list is specified then it is assumed to come
            from a restart and is used as normal.
        param_drops (list): A list of single-argument functions that act on
            params and drop certain options from being run.
        verbose (int): Output verbosity level.
        handle_completed_state (bool): Set True to examine whether the
            parameters have already been run and, additionally, whether that
            run was marked as completed. NOTE: You need to be careful with
            this option. It will only work if the completion state is being
            set at the end of an experiment run. If it is not being set then
            it will always delete and rerun.
        logging (str): The logging level.
        seed (int): Random state to set (taken from the params rather than
            passed directly). Set None for a random seed; this will be
            stored in the run config.

    Examples:
        Param drops
        ---------------
        The param drops argument is used to remove certain parameter
        combinations from the parameter grid that would otherwise have been
        run. For example:
            >>> param_drops = [
            >>>     lambda params: (params['model_type'] in ['rnn', 'gru']) and (params['depth'] > 1)
            >>> ]
            >>> basic_gridsearch(ex, grid, param_drops=param_drops)
        will run the gridsearch as normal, but in any instance where
        model_type was 'rnn' or 'gru' and 'depth' was > 1 the gridsearch
        will skip that option.

    Returns:
        None
    """
    # Set logging
    set_logger(ex, level=logging)

    # Setup the grid
    assert any([isinstance(grid, dict), isinstance(grid, list)])
    if isinstance(grid, dict):
        param_grid = list(ParameterGrid(grid))
    else:
        param_grid = grid

    # Perform the param drops
    if param_drops is not None:
        param_grid = [
            p for p in param_grid
            if not any([drop_bool(p) for drop_bool in param_drops])
        ]

    # Get num iters
    grid_len = len(param_grid)

    for i, params in tqdm(enumerate(param_grid)):
        # Print info
        if verbose > 0:
            print('\n\n\nCONFIGURATION {} of {}\n'.format(i + 1, grid_len) + '-' * 100)
            pprint(params)
            print('-' * 100)

        # Set the random seed
        handle_seed(params)

        # Skip if done
        if handle_completed_state:
            if check_run_existence(ex, params):
                continue

        # Update the ingredient configuration
        params = update_ingredient_params(ex, params)

        # Update configuration and run
        try:
            # Update with hyperopt if specified.
            # This sits inside the try/except so that, if hyperparams cannot
            # be found, the error is caught and printed.
            if 'hyperopt_metric' in params:
                metric = params['hyperopt_metric']
                assert isinstance(metric, str), "Hyperopt metric must be a string."
                hyperparams = get_hyperparams(ex, params['model_type'], metric=metric)
                params.update(hyperparams)

            # Run and mark completed if successful
            ex.run(config_updates=params, info={})
            save_pickle(True, ex.current_run.save_dir + '/other/completion_state.pkl')
        except Exception as e:
            handle_error(ex.current_run, e, print_error=True)
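# The grid argument follows the sklearn convention described in the docstring;
# an illustrative grid and drop list (the key names are hypothetical and must
# match the sacred experiment's config entries):
grid = {
    'model_type': ['rnn', 'gru', 'lstm'],
    'depth': [1, 2, 3],
    'lr': [1e-2, 1e-3],
}
param_drops = [
    # Mirrors the docstring example: skip multi-layer rnn/gru runs.
    lambda params: (params['model_type'] in ['rnn', 'gru']) and (params['depth'] > 1),
]
# basic_gridsearch(ex, grid, param_drops=param_drops)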
activation = ['logistic', 'tanh', 'relu']
solver = ['lbfgs', 'sgd', 'adam']
alpha = [x for x in np.linspace(0.0001, 1, num=1000)]
max_iter = [5000]

grid = {
    'hidden_layer_sizes': numLayers,
    'activation': activation,
    'solver': solver,
    'alpha': alpha,
    'max_iter': max_iter
}

print('Searching')
randomCombinations = random.sample(list(ParameterGrid(grid)), numSamples)

score_list = []
combination_list = []
train_acc_list = []
test_acc_list = []

for combination in randomCombinations:
    skf = StratifiedKFold(n_splits=cvCount, random_state=seed, shuffle=True)
    s = 0
    tr_acc = 0
    te_acc = 0

    for train_idx, test_idx in skf.split(X_train, Y_train):
        split_x_train, split_x_test = X_train[train_idx], X_train[test_idx]
        y_true_train, y_true_test = Y_train[train_idx], Y_train[test_idx]

        mlp = MLPClassifier(**combination)
# 'custom_action_dist': 'gaussian_copula_action_distribution', 'custom_options': { 'num_layers': 3, 'layer_size': 64, 'activation': 'relu', 'reward_to_go': True, 'use_vf_adv': True } } } param_grid_spec = { 'env': ['Reverse-v0', 'Copy-v0', 'ReversedAddition3-v0'] # 'env_config.env_name': ['Reverse-v0', 'Copy-v0', 'RepeatCopy-v0', 'DuplicatedInput-v0', 'ReversedAddition3-v0'] } param_grid = list(ParameterGrid(param_grid_spec)) for params in param_grid: exp_config = merge_configs(base_config, params) env_name = exp_config['env'] exp_name = 'ppo_baseline_{}_{}'.format(model_name, env_name) export_dir = '{}/{}'.format(base_export_dir, exp_name) run_experiment_repeatedly(exp_name, 'PPO', num_iter=num_iter, base_export_dir=export_dir, config=exp_config, seeds=seeds) # run_experiment(exp_name, PGCopulaTrainer, num_iter=100, export_dir=export_dir, # config=config) # results = tune.run( # PGCopulaTrainer,
def main(runname, expstatslog, mlflowlog, earlystop):
    if mlflowlog:
        pass
    else:
        global mlflow
        mlflow = dumbflow()

    if expstatslog:
        exp_status_write = open(EXP_STATUS, "a")
    else:
        exp_status_write = sys.stdout

    exp_status_write.write("\n\n\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write(" BEGINNING NEW EXECUTION (" + runname + ") AT "
                           + str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write(" ------------------------" + "\n\n")

    # We are tracking drift adaptivity,
    # namely labeled drift detection.

    # Set up explicit drift detection params
    explicit_drift_param_grid = {
        "allow_explicit_drift": [(True, "ExpDr")],
        "explicit_drift_class": [("LabeledDriftDetector", "LDD")],
        "explicit_drift_mode": [("PageHinkley", "PageHinkley"), ("ADWIN", "ADWIN"),
                                ("EDDM", "EDDM"), ("DDM", "DDM")],
        "explicit_update_mode": [("all", "A"), ("errors", "E")],
        "allow_unlabeled_drift": [(False, "")],
        "allow_update_schedule": [(False, "")],
        "weight_method": [("unweighted", "U"), ("performance", "P")],
        "select_method": [("recent", "RR"), ("recent-new", "RN"), ("recent-updates", "RU")],
        "filter_method": [("no-filter", "F"), ("top-k", "T"), ("nearest", "N")],
        "kval": [(5, "5"), (10, "10")]
    }
    explicit_drift_params = ParameterGrid(explicit_drift_param_grid)

    for param_set in explicit_drift_params:
        # This is an experiment
        if param_set["explicit_update_mode"][0] == "all":
            continue

        # Load up configuration file
        mlepConfig = io_utils.load_json('./MLEPServer.json')

        # Update config file and generate an experiment name
        experiment_name = ''
        for _param in param_set:
            if param_set[_param][1] != "":
                experiment_name += param_set[_param][1] + '-'
            mlepConfig["config"][_param] = param_set[_param][0]
        experiment_name = experiment_name[:-1]

        # Now we have the experimental config we can use for running an experiment.
        exp_status_write.write("--STATUS-- " + experiment_name + " ")
        exp_status_write.flush()

        try:
            runExperiment(runname, mlepConfig, experiment_name, expstatslog, earlystop)
            exp_status_write.write("SUCCESS\n")
        except Exception as e:
            exp_status_write.write("FAILED\n")
            exp_status_write.write(traceback.format_exc())
            exp_status_write.write(str(e))
            exp_status_write.write("\n")
            exp_status_write.flush()

        mlflow.end_run()
        exp_status_write.flush()

    exp_status_write.write("\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write(" FINISHED EXECUTION OF (" + runname + ") AT "
                           + str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write(" ------------------------" + "\n\n")
    exp_status_write.close()