def _create_estimator_random_classifier(classifier=any_classifier('my_clf'),
                                        preprocessing=any_preprocessing('my_pre'),
                                        max_evals=100, trial_timeout=120,
                                        seed=None, algo=tpe.suggest):
    """
    :param classifier: hyperopt search space for the classifier
    :param preprocessing: hyperopt search space for the preprocessing steps
    :param max_evals: maximum number of hyperparameter configurations to evaluate
    :param trial_timeout: per-trial timeout in seconds
    :param seed: random seed for reproducibility
    :param algo: hyperopt search algorithm (TPE by default)
    :return: a configured, unfitted HyperoptEstimator
    """
    estim = HyperoptEstimator(classifier=classifier,
                              preprocessing=preprocessing,
                              algo=algo,
                              max_evals=max_evals,
                              trial_timeout=trial_timeout,
                              ex_preprocs=None,
                              regressor=None,
                              space=None,
                              loss_fn=None,
                              continuous_loss_fn=False,
                              verbose=False,
                              fit_increment=1,
                              fit_increment_dump_filename=None,
                              seed=seed,
                              use_partial_fit=False,
                              refit=True)
    return estim
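# A minimal usage sketch for the factory above (an assumption, not part of the
# original module); load_iris and train_test_split are used purely for illustration.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
estim = _create_estimator_random_classifier(max_evals=10, trial_timeout=60, seed=0)
estim.fit(X_tr, y_tr)
print(estim.score(X_te, y_te))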
def main():
    for dataset in ['DataClass.csv', 'FeatureEnvy.csv', 'GodClass.csv', 'LongMethod.csv']:
        # Redirect stdout so each dataset gets its own log file.
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(f'../../../Dodge/data/smell/{dataset}',
                                        target='SMELLS', col_start=0, col_stop=-1)
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            # Move on to the next dataset if this run fails or times out.
            continue
def main():
    for dataset in ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF']:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        # Repeat each dataset ten times to smooth out run-to-run variance.
        for i in range(10):
            try:
                print(f'Running {dataset}')
                print('=' * 20)
                data = TextDataLoader.from_file(
                    f'../../../Dodge/data/textmining/{dataset}.txt')
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_text_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=loss,
                    trial_timeout=30)
                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print('perf:', metr.get_metrics()[0])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()
                print('Completed in', b - a, 'seconds.')
            except Exception:
                continue
def anySample1():
    # Download the data and split into training and test sets
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Split into train and test sets
    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]

    # Instantiate a HyperoptEstimator with the search space and number of evaluations.
    # (The original shadowed the imported any_preprocessing with None here, which
    # made the call below a TypeError; that line has been removed.)
    estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                              preprocessing=any_preprocessing('my_pre'),
                              algo=tpe.suggest,
                              max_evals=100,
                              trial_timeout=120)

    # Search the hyperparameter space based on the data
    estim.fit(X_train, y_train)

    # Show the results
    print(estim.score(X_test, y_test))  # 1.0
    print(estim.best_model())
def main():
    for dataset in glob.glob('../../../Dodge/data/UCI/*.csv'):
        df = pd.read_csv(dataset)
        target = df.columns[-1]
        sys.stdout = open(f'./hyperopt-log/{dataset.split("/")[-1]}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(dataset, target=target,
                                        col_start=0, col_stop=-1)
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            # Re-raise for debugging; the `continue` that followed was unreachable.
            raise
def test_hyperopt():
    # Load data
    featuren = 1406
    dir_key = '1406'
    data_key = '850+556'
    dir_path = dir_path_dict[dir_key]
    data_str = dir_path + data_str_dict[data_key]

    # stdout_path = 'outcome_hyperopt_svc.moreinfo1.txt'
    # print('[INFO] stdout_path:\t{}'.format(stdout_path))
    # sys.stdout = open(stdout_path, 'w')

    # Info string aligned with the code below, which uses any_classifier.
    print("[INFO] params:\tclassifier=any_classifier('mySVC'), algo=tpe.suggest, "
          "preprocessing=[standard_scaler('std_scl')]")
    scores = []
    sensis = []
    specis = []
    for i in range(10):
        # Load the fold
        data_path = data_str.format(i + 1)
        print(data_path)
        trainset, testset = get_dataset(data_path=data_path, foldi=i + 1, featuren=featuren)
        train_data, train_label = trainset
        test_data, test_label = testset

        # Create the estimator object
        estim = hyperopt_estimator(classifier=any_classifier('mySVC'),
                                   algo=tpe.suggest,
                                   preprocessing=[standard_scaler('std_scl')],
                                   seed=RANDOM_SEED)

        # Search the space of classifiers and preprocessing steps and their
        # respective hyperparameters in sklearn to fit a model to the data
        estim.fit(train_data, train_label)

        # Show the best classifier instance
        model = estim.best_model()
        print(model)

        # Make a prediction using the optimized model
        prediction = estim.predict(test_data)
        # Use float division; under Python 2 the original integer division
        # silently truncated the error rate to zero.
        error = np.count_nonzero(prediction - test_label) / float(test_data.shape[0])
        sensi, speci = my_scores(test_label, prediction)
        print(1 - error, sensi, speci)

        # Report the accuracy of the classifier on a given set of data
        score = estim.score(test_data, test_label)
        print(score)
        scores.append(score)
        sensis.append(sensi)
        specis.append(speci)

    print(scores)
    print("accur:\t{}\tstd:\t{}".format(np.mean(scores), np.std(scores)))
    print("sensi:\t{}".format(np.mean(sensis)))
    print("speci:\t{}".format(np.mean(specis)))
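# `my_scores` is defined elsewhere in this project. A plausible sketch of what it
# computes, assuming binary 0/1 labels (hypothetical, not the authors' code):
from sklearn.metrics import confusion_matrix

def my_scores(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / float(tp + fn)  # true positive rate
    specificity = tn / float(tn + fp)  # true negative rate
    return sensitivity, specificity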
def select_best_model(self, max_evals=100, trial_timeout=120):
    if self.train_x is None or self.test_x is None or self.train_y is None or self.test_y is None:
        self.__train_val_split__()
    # An empty list disables preprocessing; a dict is not a valid search space here.
    estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                              preprocessing=[],
                              algo=tpe.suggest,
                              max_evals=max_evals,
                              trial_timeout=trial_timeout)
    estim.fit(self.train_x.values, self.train_y.values)
    print(estim.score(self.test_x.values, self.test_y.values))
    return estim
def compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs, timeout):
    estim = HyperoptEstimator(classifier=any_classifier('clf'),
                              algo=tpe.suggest,
                              max_evals=60,
                              trial_timeout=timeout / 60)
    best = -1
    try:
        estim.fit(X_train, y_train)
        best = estim.score(X_test, y_test)
        print(estim.best_model())
    except Exception:
        # Fall back to the sentinel score on any failure.
        best = -1
    return best
def hyper_bot(self):
    """
    Fit a HyperoptEstimator and print its test-set accuracy.

    :return: None
    """
    model = HyperoptEstimator(
        classifier=any_classifier("cla"),
        preprocessing=any_preprocessing("pre"),
        algo=tpe.suggest,
        max_evals=20,
        trial_timeout=30,
    )
    model.fit(self.x_train, self.y_train)
    # Score against the test labels; the original passed x_train as the target.
    accuracy = model.score(self.x_test, self.y_test)
    print(f"Accuracy: {accuracy}")
def train_hypsklearn(X_train, X_test, y_train, y_test, mtype,
                     common_name_model, problemtype, classes,
                     default_featurenames, transform_model, settings,
                     model_session):
    modelname = common_name_model + '.pickle'
    files = list()

    if mtype in ['classification', 'c']:
        estim = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)
        # Search the hyperparameter space based on the data
        estim.fit(X_train, y_train)
    elif mtype in ['regression', 'r']:
        # Regression search spaces go through the regressor argument,
        # not the classifier argument.
        estim = HyperoptEstimator(regressor=any_regressor('my_rgr'),
                                  preprocessing=any_preprocessing('my_pre'),
                                  algo=tpe.suggest,
                                  max_evals=100,
                                  trial_timeout=120)
        # Search the hyperparameter space based on the data
        estim.fit(X_train, y_train)

    # Show the results
    scores = estim.score(X_test, y_test)
    print(scores)
    print(estim.best_model())
    bestmodel = str(estim.best_model())

    print('saving classifier to disk')
    with open(modelname, 'wb') as f:
        pickle.dump(estim, f)

    files.append(modelname)
    modeldir = os.getcwd()

    return modelname, modeldir, files
def main(): file_dic = {"ivy": ["ivy-1.4.csv", "ivy-2.0.csv"], "lucene": ["lucene-2.0.csv", "lucene-2.2.csv"], "lucene2": ["lucene-2.2.csv", "lucene-2.4.csv"], "poi": ["poi-1.5.csv", "poi-2.5.csv"], "poi2": ["poi-2.5.csv", "poi-3.0.csv"], "synapse": ["synapse-1.0.csv", "synapse-1.1.csv"], "synapse2": ["synapse-1.1.csv", "synapse-1.2.csv"], "camel": ["camel-1.2.csv", "camel-1.4.csv"], "camel2": ["camel-1.4.csv", "camel-1.6.csv"], "xerces": ["xerces-1.2.csv", "xerces-1.3.csv"], "jedit": ["jedit-3.2.csv", "jedit-4.0.csv"], "jedit2": ["jedit-4.0.csv", "jedit-4.1.csv"], "log4j": ["log4j-1.0.csv", "log4j-1.1.csv"], "xalan": ["xalan-2.4.csv", "xalan-2.5.csv"] } for dataset in file_dic: sys.stdout = open(f'./hyperopt-log/{dat}.txt', 'w') print(f'Running {dat}') print('=' * 20) data = DataLoader.from_files( base_path='./issue_close_time/', files=file_dic[dataset]) try: a = time.time() estim = HyperoptEstimator(classifier=any_classifier('clf'), preprocessing=any_preprocessing( 'pre'), algo=tpe.suggest, max_evals=30, loss_fn=loss, trial_timeout=30) estim.fit(data.x_train, data.y_train) preds = estim.predict(data.x_test) metr = ClassificationMetrics(data.y_test, preds) metr.add_metrics(['d2h', 'pd', 'pf']) print(metr.get_metrics()) print(estim.best_model()) b = time.time() print('Completed in', b-a, 'seconds.') except: continue
def main():
    directories = ["1 day", "7 days", "14 days", "30 days",
                   "90 days", "180 days", "365 days"]
    datasets = ["camel", "cloudstack", "cocoon", "hadoop", "deeplearning",
                "hive", "node", "ofbiz", "qpid"]
    for dat in datasets:
        for time_ in directories:
            sys.stdout = open(f'./hyperopt-log/{dat}-{time_}.txt', 'w')
            print(f'Running {dat}-{time_}')
            print('=' * 30)
            data = DataLoader.from_file(
                "/Users/ryedida/PycharmProjects/raise-package/issue_close_time/"
                + time_ + "/" + dat + ".csv",
                target="timeOpen", col_start=0)
            try:
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=partial(loss, dat, time_),
                    trial_timeout=30)
                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()
                print('Completed in', b - a, 'seconds.')
            except Exception:
                # Covers ValueError and anything else a failed trial may raise;
                # the original's separate ValueError branch was redundant.
                continue
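# The `loss` bound by functools.partial above is defined elsewhere. Since hpsklearn
# calls loss_fn(y_true, y_pred), the partial implies a signature of
# loss(dat, time_, y_true, y_pred). A hypothetical stub, for illustration only:
import numpy as np

def loss(dat, time_, y_true, y_pred):
    # hypothetical placeholder: misclassification rate; the real loss lives elsewhere
    return float(np.mean(np.asarray(y_true) != np.asarray(y_pred)))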
"""
The example in the documentation (http://hyperopt.github.io/hyperopt-sklearn/)
returns the error: "ConnectionResetError: [Errno 54] Connection reset by peer"
"""
# Download the data and split into training and test sets
digits = fetch_mldata('MNIST original')
X = digits.data
y = digits.target

test_size = int(0.2 * len(y))
np.random.seed(13)
indices = np.random.permutation(len(X))
X_train = X[indices[:-test_size]]
y_train = y[indices[:-test_size]]
X_test = X[indices[-test_size:]]
y_test = y[indices[-test_size:]]

estim = HyperoptEstimator(classifier=any_classifier('clf'),
                          algo=tpe.suggest,
                          trial_timeout=300)
estim.fit(X_train, y_train)

print(estim.score(X_test, y_test))  # <<show score here>>
print(estim.best_model())           # <<show model here>>
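# The ConnectionResetError most likely comes from fetch_mldata, which downloads
# from the long-defunct mldata.org server. A sketch of a workaround using OpenML,
# assuming scikit-learn >= 0.24 (fetch_openml with as_frame):
from sklearn.datasets import fetch_openml

digits = fetch_openml('mnist_784', version=1, as_frame=False)
X = digits.data
y = digits.target.astype(int)  # OpenML serves MNIST labels as strings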
# define dataset
X, y = make_classification(n_samples=100, n_features=10, n_informative=5,
                           n_redundant=5, random_state=1)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# define search
model = HyperoptEstimator(classifier=any_classifier("cla"),
                          preprocessing=any_preprocessing("pre"),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=30)
# perform the search
model.fit(X_train, y_train)
# summarize performance
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")
# summarize the best model (best_model is a method, so it must be called)
print(model.best_model())
def run(dataset, config):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    # Sentinel: metrics mapped to `default` fall back to hyperopt-sklearn's
    # built-in loss (accuracy for classification, R^2 for regression).
    default = lambda: 0
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
        rmse=(mean_squared_error, False),  # no square root: MSE is used as a proxy
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] \
        if config.metric in metrics_to_loss_mapping else (None, False)
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    training_params = {k: v for k, v in config.framework_params.items()
                       if not k.startswith('_')}

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info("Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
             config.max_runtime_seconds, 'all', config.metric)

    X_train = dataset.train.X_enc
    y_train = dataset.train.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **training_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3, sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(kill_proc_tree,
                                                          timeout=5,
                                                          include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    log.info('Predicting on the test set.')
    X_test = dataset.test.X_enc
    y_test = dataset.test.y_enc
    predictions = estimator.predict(X_test)

    if is_classification:
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`
    else:
        probabilities = None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.trials),
                  training_duration=training.duration)
def main():
    # Construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", default='nailgun',
                    help="path to nailgun folder")
    ap.add_argument("-m", "--model", required=True,
                    help="name of the model file to save the model")
    # type=int so a user-supplied value is not left as a string
    ap.add_argument("-cs", "--csize", type=int, default=80,
                    help="parameter to crop the image around the nailgun")
    ap.add_argument("-ex", "--ext", type=str, default='.jpeg',
                    help="extension of the images")
    args = vars(ap.parse_args())

    # Load parameters
    crop_size = args['csize']
    path_to_images = args['path']
    filename = args['model']
    ext = args['ext']
    split_factor = 0.75

    # List all of the images
    paths, labels = list_images(path_to_images, ext)

    # Get paths correctly distributed good/bad
    n_paths = distribute_paths(paths)

    # Split and generate labels
    (x_train_paths, y_train_str), (x_test_paths, y_test_str) = split_and_get_labels(n_paths, split_factor)
    print('--- Split ---')
    print('Train: ' + str(len(x_train_paths)) + ', Test: ' + str(len(x_test_paths)))

    # Load object for label binarizer
    lb = LabelBinarizer()
    lb.fit(y_train_str)

    n_feats = crop_size ** 2 + 2
    x_train = np.zeros((len(x_train_paths), n_feats), np.uint8)
    y_train = np.zeros((len(y_train_str), 1), np.int32)

    print('---- Extracting Train samples ----')
    progress = tqdm.tqdm(total=len(x_train_paths))
    for idx, path in enumerate(x_train_paths):
        x_train[idx, :] = extract_nail(path)
        y_train[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
        progress.update(1)
    y_train = np.ravel(y_train)

    print('---- Extracting Test samples ----')
    progress = tqdm.tqdm(total=len(x_test_paths))
    # np.float was removed from NumPy; use an explicit dtype instead
    x_test = np.zeros((len(x_test_paths), n_feats), np.float64)
    y_test = np.zeros((len(y_test_str), 1), np.int32)
    for idx, path in enumerate(x_test_paths):
        x_test[idx, :] = extract_nail(path)
        y_test[idx] = lb.transform([path.split("_")[-1].split(".")[0]])
        progress.update(1)
    y_test = np.ravel(y_test)

    # Define HyperoptEstimator
    estim = HyperoptEstimator(classifier=any_classifier('clf'),
                              preprocessing=any_preprocessing('pp'),
                              algo=tpe.suggest,
                              trial_timeout=30)
    estim.fit(x_train, y_train)

    print('---- BEST SCORE (acc) ----')
    print(estim.score(x_test, y_test))
    print('---- BEST MODEL ----')
    print(estim.best_model())

    pkl_filename = 'model/' + filename + '.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(estim.best_model(), file)
    print('--- Correctly saved! ---')
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** Hyperopt-sklearn ****\n")

    is_classification = config.type == 'classification'

    default = lambda: 0
    metrics_to_loss_mapping = dict(
        acc=(default, False),  # lambda y, pred: 1.0 - accuracy_score(y, pred)
        auc=(lambda y, pred: 1.0 - roc_auc_score(y, pred), False),
        f1=(lambda y, pred: 1.0 - f1_score(y, pred), False),
        # logloss=(log_loss, True),
        mae=(mean_absolute_error, False),
        mse=(mean_squared_error, False),
        msle=(mean_squared_log_error, False),
        r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
    )
    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] \
        if config.metric in metrics_to_loss_mapping else (None, False)
    if loss_fn is None:
        log.warning("Performance metric %s not supported: defaulting to %s.",
                    config.metric, 'accuracy' if is_classification else 'r2')
    if loss_fn is default:
        loss_fn = None

    log.warning("Ignoring cores constraint of %s cores.", config.cores)
    log.info("Running hyperopt-sklearn with a maximum time of %ss on %s cores, optimizing %s.",
             config.max_runtime_seconds, 'all', config.metric)

    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    if is_classification:
        classifier = any_classifier('clf')
        regressor = None
    else:
        classifier = None
        regressor = any_regressor('rgr')

    estimator = HyperoptEstimator(classifier=classifier,
                                  regressor=regressor,
                                  algo=tpe.suggest,
                                  loss_fn=loss_fn,
                                  continuous_loss_fn=continuous_loss_fn,
                                  trial_timeout=config.max_runtime_seconds,
                                  seed=config.seed,
                                  **config.framework_params)

    with InterruptTimeout(config.max_runtime_seconds * 4 / 3, sig=signal.SIGQUIT):
        with InterruptTimeout(config.max_runtime_seconds,
                              before_interrupt=ft.partial(kill_proc_tree,
                                                          timeout=5,
                                                          include_parent=False)):
            with Timer() as training:
                estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = Encoder('one-hot', target=False, encoded_type=float).fit_transform(predictions) \
        if is_classification else None

    save_predictions_to_file(dataset=dataset,
                             output_file=config.output_predictions_file,
                             probabilities=probabilities,
                             predictions=predictions,
                             truth=y_test,
                             target_is_encoded=True)

    return dict(models_count=len(estimator.trials),
                training_duration=training.duration)
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5, scoring='accuracy',
                n_jobs='auto', test_size=0.3, use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    if ',' in scoring:
        scoring = scoring.split(',')
    # If scoring is precision, make the scorer manually to suppress zero_division
    # warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}

    if n_jobs == 'auto':
        n_jobs = os.cpu_count()
    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):  # `type(x) is 'str'` was always False
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)

    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                              index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target

        # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
        # replace infinity values with nan so that they can later be imputed to a finite value
        features = features.dropna(axis='columns', how='all').dropna().replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]

        features = features.replace([np.inf, -np.inf], np.nan)
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index, columns=features.columns)

        X_train, X_test, y_train, y_test = train_test_split(
            features.values, target.values, shuffle=False, test_size=test_size)

        # Summarize distribution
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
            # cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                preprocessing=any_preprocessing('my_pre'),
                                algo=tpe.suggest,
                                max_evals=100,
                                trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}

        # Plot learning curve for the classifier
        # est = p.estimator
        # est.set_params(**best_params)
        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        # plt.tight_layout()
        _train_ax = [axes[0][0], axes[0][1], axes[0][2]]
        # plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym),
        #                     X_train, y_train, axes=_train_ax, cv=cv)

        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])

        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        # Use the test split here; the original mistakenly re-plotted the training data
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])

        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()

        # Test ensemble's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(
            classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set:\n{}".format(
            classification_report(y_test, predictions2)))

        report = {
            'training_set': {
                'features': X_train.shape[1],
                'records': X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true': [y for y in y_train],
                'y_pred': [y for y in predictions1]
            },
            'test_set': {
                'features': X_test.shape[1],
                'records': X_test.shape[0],
                'class_distribution': get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }

        # If the classifier has a feature_importances attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') \
                and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v) for k, v in sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i] for i, s in enumerate(clf.support_) if s]

        train_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct'])
                      for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct'])
                     for k, d in get_class_distribution(y_test).items()]
        logger.info('Model evaluation: \n'
                    '== Training set ==\n'
                    '\t# Features: {} | # Records: {}\n'
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    '== Test set ==\n'
                    '\t# Features: {} | # Records: {}\n'
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                            report['training_set']['accuracy'], report['training_set']['precision'],
                            report['training_set']['mse'], report['training_set']['recall'],
                            report['training_set']['f1'],
                            X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                            report['test_set']['accuracy'], report['test_set']['precision'],
                            report['test_set']['mse'], report['test_set']['recall'],
                            report['test_set']['f1']))

        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model': model_path,
            'params': params_path,
            'report': report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
import time

import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, any_classifier
from hpsklearn import svc

digits = load_digits()
X = digits.data
y = digits.target

test_size = int(0.2 * len(y))
np.random.seed(0)
indices = np.random.permutation(len(X))
X_train = X[indices[:-test_size]]
y_train = y[indices[:-test_size]]
X_test = X[indices[-test_size:]]
y_test = y[indices[-test_size:]]

estim = HyperoptEstimator(classifier=any_classifier('clf'), algo=tpe.suggest, seed=0)
estim.fit(X_train, y_train)

print(estim.score(X_test, y_test))
print(estim.best_model())
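# The snippet imports `svc` from hpsklearn but never uses it. A minimal sketch of
# narrowing the search to SVC models only, assuming the installed hpsklearn
# version exposes the `svc` search space:
estim_svc = HyperoptEstimator(classifier=svc('my_svc'), algo=tpe.suggest, seed=0)
estim_svc.fit(X_train, y_train)
print(estim_svc.score(X_test, y_test))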
# + [markdown] heading_collapsed=true
# ### Find Best Algorithm with Best Params Using HyperoptEstimator AutoML

# + hidden=true
preproc = hp.choice('myprepros_name',
                    [[min_max_scaler('myprepros_name.norm')],
                     [standard_scaler('myprepros_name.std_scaler')],
                     [min_max_scaler('myprepros_name.norm2'),
                      standard_scaler('myprepros_name.std_scaler2')]])

# + hidden=true
# with mlflow.start_run():
model = HyperoptEstimator(
    classifier=any_classifier('cla'),
    preprocessing=preproc,  # any_preprocessing('pre')
    algo=tpe.suggest,
    max_evals=50,
    trial_timeout=5000)

# perform the search
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)

# mlflow.log_params(params)
# mlflow.log_metric('accuracy', accuracy)

# Logging training data
# mlflow.log_artifact(local_path='../Data/higgs_boson_training.csv')
# Logging training code
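# + hidden=true
# A minimal sketch of re-enabling the commented-out MLflow tracking above, assuming
# an MLflow tracking backend is configured and `model` has already been fit:
import mlflow

with mlflow.start_run():
    mlflow.log_metric('accuracy', accuracy)
    # Param values are logged as strings; a long learner repr may be truncated.
    mlflow.log_param('best_learner', str(model.best_model()['learner']))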
s = random_forest('clf' + '.random_forest')  # trailing comma removed: it made `s` a 1-tuple

params_regressor = {
    'regressor': None,
    'preprocessing': None,
    'max_evals': 15,
    'trial_timeout': 100,
    'seed': 1
}
params_classifier = {
    'classifier': s,
    'preprocessing': None,
    'max_evals': 15,
    'trial_timeout': 100,
    'seed': 1
}
s2 = any_classifier('te')
print(1)
# estimator = ModelBuilder.create_estimator(params_regressor)
# dataset_dict = test_dataset()
# m = Models(params_classifier, dataset_dict)
#
# print(m.fit_and_return(verbose_debug=False))
print(0)
# print(create_estimator(test_dataset()))
            # (tail of the preceding auto-sklearn branch)
            f.write('dataset: {}, Auto-sklearn acc: {} ({}), runtime: {}'.format(
                dataset, acc_mean, acc_std, runtime))
        filename = "autosklearn_{}.file".format(dataset)
        with open(filename, "wb") as f:
            np.save(f, np.array(acc_all))

### TPE/Hyperopt-sklearn ###
if method == "hpsklearn":
    from hpsklearn import HyperoptEstimator, any_classifier
    if dataset == "all":
        for dataset in datasets_all:
            time_all_models, time_each_model = get_required_time(dataset, n_run)
            hpsklearn_start_time = timeit.default_timer()
            acc_all = []
            for run in range(n_run):
                X_train, y_train, X_test, y_test = test_functions.auto_ml.gen_train_test_data(dataset, seed=run)
                hyperopt = HyperoptEstimator(classifier=any_classifier("clf"),
                                             algo=tpe.suggest,
                                             max_evals=budget,
                                             preprocessing=[],
                                             trial_timeout=time_each_model)
                hyperopt.fit(X_train, y_train)
                y_pred = hyperopt.predict(X_test)
                # Renamed from `auc`: this is accuracy, not area under the curve.
                acc = accuracy_score(y_test, y_pred)
                acc_all.append(acc)
                print("run: {}, acc: {}".format(run, round(acc, 4)))
            hpsklearn_end_time = timeit.default_timer()
            acc_mean = round(np.mean(acc_all), 4)
            acc_std = round(np.std(acc_all) / np.sqrt(n_run), 4)
            runtime = round(hpsklearn_end_time - hpsklearn_start_time, 2)
            print("Hyperopt-sklearn acc: {} ({})".format(acc_mean, acc_std))
            print("Hyperopt-sklearn runtime: {}(s)".format(runtime))
            # save result to file
            with open('hpsklearn_result_{}.txt'.format(dataset), 'w') as f:
                f.write('dataset: {}, Hyperopt-sklearn acc: {} ({}), runtime: {}'.format(
                    dataset, acc_mean, acc_std, runtime))