def main():
    args = parse_args()

    print("CLI: Processing input")
    if not os.path.exists(args.input_file.lower()):
        raise FileNotFoundError(args.input_file)
    if args.input_file.lower().split(".")[-1] not in ["csv", "arff"]:
        raise ValueError("Unknown file extension. Please use csv or arff.")

    kwargs = {}
    if args.input_file.lower().endswith(".csv") and args.separator is not None:
        kwargs["sep"] = args.separator

    x, y = X_y_from_file(
        file_path=args.input_file.lower(),
        split_column=args.target,
        **kwargs,
    )
    if args.mode is None:
        if is_categorical_dtype(y.dtype):
            args.mode = "classification"
        else:
            args.mode = "regression"
        print(f"Detected a {args.mode} problem.")

    print("CLI: Initializing GAMA")
    log_level = logging.INFO if args.verbose else logging.WARNING
    configuration = dict(
        regularize_length=args.prefer_short,
        max_total_time=args.time_limit_m * 60,
        max_eval_time=args.max_eval_time_m * 60,
        n_jobs=args.n_jobs,
        verbosity=log_level,
        output_directory=args.outdir,
        store="nothing" if args.dry_run else "logs",
    )
    if args.metric:
        configuration["scoring"] = args.metric

    if args.mode == "regression":
        automl = GamaRegressor(**configuration)
    elif args.mode == "classification":
        automl = GamaClassifier(**configuration)
    else:
        raise ValueError(f"Mode {args.mode} is not valid (--mode).")

    if not args.dry_run:
        print("CLI: Starting model search")
        automl.fit(x, y)

        # == Model Export ===
        print("CLI: Exporting models.")
        with open(args.output_file, "wb") as fh:
            pickle.dump(automl.model, fh)

        if args.export_python is not None:
            automl.export_script(args.export_python, raise_if_exists=False)
    else:
        automl.cleanup("all")
    print("done!")
def prepare_df(log_path, filename_class, filename_regr):
    '''
    Executes the transformation from GAMA log to dataframe for all the logs
    in a path.

    Parameters:
    -----------
    log_path: string
        Contains the name of the path where the logs are stored.
    filename_class: string
        Contains the name for the csv file of the classification tasks.
    filename_regr: string
        Contains the name for the csv file of the regression tasks.

    Returns:
    --------
    str
        Contains a confirmation that the preparation of the dataframes was
        executed.
    '''
    classification, regression, clustering = get_dataset_ids(10000000)
    df_class, df_regr = log_to_df(log_path, classification, regression)
    df_class = df_class.reset_index(drop=True)
    df_regr = df_regr.reset_index(drop=True)
    automl_class = GamaClassifier(scoring='accuracy')
    automl_regr = GamaRegressor(scoring='r2')
    df_class = children_to_components(df_class, automl_class)
    df_regr = children_to_components(df_regr, automl_regr)
    df_class.to_csv(filename_class, index=False, sep=';')
    df_regr.to_csv(filename_regr, index=False, sep=';')
    return "Prepared the dataframes."
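# A minimal usage sketch for prepare_df, assuming the helpers it calls
# (get_dataset_ids, log_to_df, children_to_components) are importable from
# this module and that GAMA logs already exist under the glob path. The
# output file names below are hypothetical.
prepare_df(
    log_path='../data/ex3/*.log',
    filename_class='../data/ex3/components_class.csv',
    filename_regr='../data/ex3/components_regr.csv',
)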
def test_full_system_multi_core():
    automl = GamaClassifier(
        random_state=0,
        max_total_time=60,
        max_memory_mb=4_000,
        store="nothing",
        n_jobs=2,
    )
    _gama_on_digits(automl)
def main():
    args = parse_args()

    print('CLI: Processing input')
    if args.input_file.lower().endswith('.csv'):
        raise NotImplementedError("CSV currently not supported.")
        # data = pd.read_csv(args.input_file, sep=args.separator)

    if args.input_file.lower().endswith('.arff') and args.mode is None:
        attributes = load_feature_metadata_from_arff(args.input_file)
        target = list(attributes)[-1] if args.target is None else args.target
        target_type = attributes[target]
        if '{' in target_type:
            # Nominal features are denoted by listing all their values,
            # e.g. {VALUE_1, VALUE_2, ...}
            args.mode = 'classification'
        elif target_type.lower() == 'real':
            args.mode = 'regression'
        else:
            raise ValueError(
                f"Target column {target} has type {target_type}, which GAMA can't model."
            )

    print('CLI: Initializing GAMA')
    log_level = logging.INFO if args.verbose else logging.WARNING
    configuration = dict(
        regularize_length=args.prefer_short,
        max_total_time=args.time_limit_m * 60,
        max_eval_time=args.max_eval_time_m * 60,
        n_jobs=args.n_jobs,
        verbosity=log_level,
        keep_analysis_log=args.logpath,
    )
    if args.metric:
        configuration['scoring'] = args.metric

    if args.mode == 'regression':
        automl = GamaRegressor(**configuration)
    elif args.mode == 'classification':
        automl = GamaClassifier(**configuration)
    else:
        raise ValueError(f"Mode {args.mode} is not valid (--mode).")

    print('CLI: Starting model search')
    if args.input_file.lower().endswith('.arff'):
        automl.fit_arff(args.input_file.lower(), target_column=args.target)
    # else:
    #     automl.fit(x, y)

    # == Model Export ===
    print('CLI: Exporting models.')
    with open(args.output_file, 'wb') as fh:
        pickle.dump(automl.model, fh)

    if args.export_python is not None:
        automl.export_script(args.export_python, raise_if_exists=False)
    print('done!')
def opset():
    gc = GamaClassifier(config=clf_config, scoring="accuracy")
    return gc._operator_set
def pset(): gc = GamaClassifier(config=clf_config, scoring="accuracy", store="nothing") yield gc._pset gc.cleanup("all")
def gamaclassifier():
    return GamaClassifier(random_state=0, max_total_time=60)
cat_vars_index.append(-1)
df.iloc[:, -1].replace(0, 2, inplace=True)

# Divide into equal sets of data ~20,000 samples
B = np.array_split(df, n)
B[0]

# In[6]:

# Initialization
cls = GamaClassifier(
    max_total_time=3600,
    keep_analysis_log=None,
    n_jobs=1,
    scoring='accuracy',
    post_processing_method=EnsemblePostProcessing(),
)

X = B[0].iloc[:, 0:-1]
y = B[0].iloc[:, -1]

print("Starting `fit`")
cls.fit(X, y)

anytime_model = cls

# Prequential evaluation
for i in range(1, n):
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

from gama import GamaClassifier

if __name__ == '__main__':
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    automl = GamaClassifier(max_total_time=180, keep_analysis_log=None, n_jobs=1)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit(X_train, y_train)

    label_predictions = automl.predict(X_test)
    probability_predictions = automl.predict_proba(X_test)

    print('accuracy:', accuracy_score(y_test, label_predictions))
    print('log loss:', log_loss(y_test, probability_predictions))
def execute_recommendations(X, y, cat_ind, recommendations, task, n_jobs=1):
    '''
    Executes the recommendations made by the nearest neighbor model based on
    a learning task and sets the number of jobs to n_jobs for the estimators
    and preprocessing algorithms.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
    y: pd.Series
        Contains the series of the target of a given dataset.
    cat_ind: list
        Contains boolean values that indicate whether a column is categorical.
    recommendations: list
        Contains the recommendations made by the nearest neighbor model.
    task: str
        Contains the learning task (i.e. "classification" or "regression").
    n_jobs: int
        Contains the number of jobs to set for the estimators and
        preprocessing algorithms available in the recommended pipelines.

    Returns:
    --------
    list
        Contains the score of each pipeline run on X and y.
    '''
    categorical, numeric, string = category_numeric_or_string(X, cat_ind)
    if task.lower() == "classification":
        gama = GamaClassifier(scoring='accuracy')
    elif task.lower() == "regression":
        gama = GamaRegressor(scoring='r2')
    else:
        return "{} is not implemented, please try 'classification' or 'regression'".format(task)

    scores = []
    for recommendation in recommendations:
        pipeline, k, did = recommendation
        ind = Individual.from_string(pipeline, gama._pset)
        X_pipe = deepcopy(X)
        y_pipe = deepcopy(y)
        X_pipe, y_pipe = onehot_or_targ(X_pipe, y_pipe, categorical, k)
        pipeline = [eval(p.str_nonrecursive) for p in ind.primitives]
        pipeline.reverse()
        try:
            for i, component in enumerate(pipeline):
                if i == len(pipeline) - 1:
                    # The final component is the estimator; set n_jobs if it
                    # supports parallelism.
                    try:
                        setattr(component, 'n_jobs', n_jobs)
                    except Exception:
                        pass
                    # The split is only used by the commented-out alternative
                    # below; scoring is done with 10-fold cross-validation.
                    X_train, X_test, y_train, y_test = train_test_split(
                        X_pipe, y_pipe, test_size=0.30, random_state=42
                    )
                    cv_scores = cross_val_score(component, X_pipe, y_pipe, cv=10)
                    score = sum(cv_scores) / 10
                    # component.fit(X_train, y_train)
                    # score = component.score(X_test, y_test)
                    scores.append(score)
                else:
                    # Earlier components are preprocessors; feature selectors
                    # need the target to fit.
                    if isinstance(component, (SelectPercentile, SelectFwe)):
                        X_pipe = component.fit_transform(X_pipe, y_pipe)
                    else:
                        X_pipe = component.fit_transform(X_pipe)
        except Exception:
            scores.append(0)
    return scores
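# A minimal sketch of calling execute_recommendations, assuming the helpers
# above are importable and that `recommendations` holds (pipeline_string, k,
# dataset_id) tuples produced elsewhere by the nearest neighbor model. The
# OpenML dataset id and the pipeline string below are purely illustrative.
import openml as oml

ds = oml.datasets.get_dataset(31)
X, y, cat_ind, _ = ds.get_data(
    dataset_format='dataframe', target=ds.default_target_attribute
)
recommendations = [("GaussianNB(data)", 1, 31)]  # illustrative tuple only
scores = execute_recommendations(
    X, y, cat_ind, recommendations, task="classification", n_jobs=2
)
print(scores)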
def train_gama(X_train, X_test, y_train, y_test, mtype, common_name_model,
               problemtype, classes, default_featurenames, transform_model,
               settings, model_session):

    model_name = common_name_model + '.pickle'
    files = list()

    if mtype in ['c']:
        automl = GamaClassifier(max_total_time=180, keep_analysis_log=None)
        print("Starting GAMA `fit` - usually takes around 3 minutes but can take longer for large datasets")
        automl.fit(X_train, y_train)

        label_predictions = automl.predict(X_test)
        probability_predictions = automl.predict_proba(X_test)

        accuracy = accuracy_score(y_test, label_predictions)
        log_loss_pred = log_loss(y_test, probability_predictions)
        log_loss_score = automl.score(X_test, y_test)

        print('accuracy:', accuracy)
        print('log loss pred:', log_loss_pred)
        print('log_loss_score', log_loss_score)

    elif mtype in ['regression', 'r']:
        automl = GamaRegressor(max_total_time=180, keep_analysis_log=None, n_jobs=1)
        print("Starting GAMA `fit` - usually takes around 3 minutes but can take longer for large datasets")
        automl.fit(X_train, y_train)

        predictions = automl.predict(X_test)
        mse_error = mean_squared_error(y_test, predictions)
        print("MSE:", mse_error)

    # SAVE ML MODEL
    with open(model_name, 'wb') as modelfile:
        pickle.dump(automl, modelfile)
    files.append(model_name)

    model_dir = os.getcwd()

    return model_name, model_dir, files
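# A minimal sketch of calling train_gama on a toy dataset. Most of the extra
# bookkeeping arguments are not used by the function body above, so
# placeholder values are passed; all names here are hypothetical.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)
model_name, model_dir, files = train_gama(
    X_train, X_test, y_train, y_test,
    mtype='c', common_name_model='gama_breast_cancer',
    problemtype='classification', classes=[0, 1],
    default_featurenames=None, transform_model=None,
    settings={}, model_session=None,
)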
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

from gama import GamaClassifier

if __name__ == "__main__":
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    automl = GamaClassifier(max_total_time=180, store="nothing", n_jobs=1)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit(X_train, y_train)

    label_predictions = automl.predict(X_test)
    probability_predictions = automl.predict_proba(X_test)

    print("accuracy:", accuracy_score(y_test, label_predictions))
    print("log loss:", log_loss(y_test, probability_predictions))
    --------
    pd.DataFrame
        Contains a pd.DataFrame for that specific log.
    '''
    report = GamaReport(logfile=log_file)
    return report.evaluations


if __name__ == "__main__":
    # single example:
    log_to_df_file("../data/ex3/a411.log")

    # multiple example:
    classification, regression, clustering = get_dataset_ids(10000000)
    load_path = '../data/ex3/*.log'
    filename_class = '../data/ex3/testc.csv'
    filename_regr = '../data/ex3/testr.csv'
    df_class, df_regr = log_to_df(load_path, classification, regression)
    print(df_class, df_regr)
    df_class = df_class.reset_index(drop=True)
    df_regr = df_regr.reset_index(drop=True)
    automl_regr = GamaRegressor(scoring='r2')
    automl_class = GamaClassifier(scoring='accuracy')
    children_to_components(df_class, automl_class).to_csv(filename_class, index=False)
    children_to_components(df_regr, automl_regr).to_csv(filename_regr, index=False)
def _test_dataset_problem(data,
                          metric: str,
                          arff: bool = False,
                          y_type: Type = pd.DataFrame,
                          search: BaseSearch = AsyncEA(),
                          missing_values: bool = False,
                          max_time: int = 60):
    """
    :param data:
    :param metric:
    :param arff:
    :param y_type: pd.DataFrame, pd.Series, np.ndarray or str
    :return:
    """
    gama = GamaClassifier(
        random_state=0,
        max_total_time=max_time,
        scoring=metric,
        search_method=search,
        n_jobs=1,
        post_processing_method=EnsemblePostProcessing(ensemble_size=5),
    )
    if arff:
        train_path = 'tests/data/{}_train.arff'.format(data['name'])
        test_path = 'tests/data/{}_test.arff'.format(data['name'])

        X, y = data['load'](return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, random_state=0
        )
        y_test = [str(val) for val in y_test]

        with Stopwatch() as sw:
            gama.fit_arff(train_path, target_column=data['target'])
        class_predictions = gama.predict_arff(test_path, target_column=data['target'])
        class_probabilities = gama.predict_proba_arff(test_path, target_column=data['target'])
        gama_score = gama.score_arff(test_path)
    else:
        X, y = data['load'](return_X_y=True)
        if y_type == str:
            databunch = data['load']()
            y = np.asarray([databunch.target_names[c_i] for c_i in databunch.target])
        if y_type in [pd.Series, pd.DataFrame]:
            y = y_type(y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, random_state=0
        )
        if missing_values:
            X_train[1:300:2, 0] = X_train[2:300:5, 1] = float("NaN")
            X_test[1:100:2, 0] = X_test[2:100:5, 1] = float("NaN")

        with Stopwatch() as sw:
            gama.fit(X_train, y_train)
        class_predictions = gama.predict(X_test)
        class_probabilities = gama.predict_proba(X_test)
        gama_score = gama.score(X_test, y_test)

    assert 60 * FIT_TIME_MARGIN > sw.elapsed_time, 'fit must stay within 110% of allotted time.'

    assert isinstance(class_predictions, np.ndarray), 'predictions should be numpy arrays.'
    assert (data['test_size'],) == class_predictions.shape, 'predict should return (N,) shaped array.'

    accuracy = accuracy_score(y_test, class_predictions)
    # Majority classifier on this split achieves 0.6293706293706294
    print(data['name'], metric, 'accuracy:', accuracy)
    assert data['base_accuracy'] <= accuracy, 'predictions should be at least as good as majority class.'

    assert isinstance(class_probabilities, np.ndarray), 'probability predictions should be numpy arrays.'
    assert (data['test_size'], data['n_classes']) == class_probabilities.shape, (
        'predict_proba should return (N,K) shaped array.'
    )

    # Majority classifier on this split achieves 12.80138131184662
    logloss = log_loss(y_test, class_probabilities)
    print(data['name'], metric, 'log-loss:', logloss)
    assert data['base_log_loss'] >= logloss, 'predictions should be at least as good as majority class.'

    score_to_match = logloss if metric == 'log_loss' else accuracy
    assert score_to_match == pytest.approx(gama_score)
def pset():
    gc = GamaClassifier(config=clf_config, scoring="accuracy")
    return gc._pset
    },
}

# In[ ]:

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
)

# Initialization
counter = 0
cls = GamaClassifier(
    max_total_time=3600,
    keep_analysis_log=None,
    n_jobs=1,
    scoring='log_loss',
    post_processing_method=EnsemblePostProcessing(),
    config=limited_config,
)
drift_detector = EDDM()

start = 1
X_train = B[start - 1].iloc[:, 0:-1]
y_train = B[start - 1].iloc[:, -1]

print("Starting to `fit`")
cls.fit(X_train, y_train)

anytime_model = cls

# Prequential evaluation
def _test_dataset_problem(
    data,
    metric: str,
    arff: bool = False,
    y_type: Type = pd.DataFrame,
    search: BaseSearch = AsyncEA(),
    missing_values: bool = False,
    max_time: int = 60,
):
    """
    :param data:
    :param metric:
    :param arff:
    :param y_type: pd.DataFrame, pd.Series, np.ndarray or str
    :return:
    """
    gama = GamaClassifier(
        random_state=0,
        max_total_time=max_time,
        scoring=metric,
        search=search,
        n_jobs=1,
        post_processing=EnsemblePostProcessing(ensemble_size=5),
        store="nothing",
    )
    if arff:
        train_path = f"tests/data/{data['name']}_train.arff"
        test_path = f"tests/data/{data['name']}_test.arff"

        X, y = data["load"](return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, random_state=0
        )
        y_test = [str(val) for val in y_test]

        with Stopwatch() as sw:
            gama.fit_from_file(train_path, target_column=data["target"])
        class_predictions = gama.predict_from_file(test_path, target_column=data["target"])
        class_probabilities = gama.predict_proba_from_file(test_path, target_column=data["target"])
        gama_score = gama.score_from_file(test_path)
    else:
        X, y = data["load"](return_X_y=True)
        if y_type == str:
            databunch = data["load"]()
            y = np.asarray([databunch.target_names[c_i] for c_i in databunch.target])
        if y_type in [pd.Series, pd.DataFrame]:
            y = y_type(y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, random_state=0
        )
        if missing_values:
            X_train[1:300:2, 0] = X_train[2:300:5, 1] = float("NaN")
            X_test[1:100:2, 0] = X_test[2:100:5, 1] = float("NaN")

        with Stopwatch() as sw:
            gama.fit(X_train, y_train)
        class_predictions = gama.predict(X_test)
        class_probabilities = gama.predict_proba(X_test)
        gama_score = gama.score(X_test, y_test)

    assert 60 * FIT_TIME_MARGIN > sw.elapsed_time, "fit must stay within 110% of allotted time."

    assert isinstance(class_predictions, np.ndarray), "predictions should be numpy arrays."
    assert (data["test_size"],) == class_predictions.shape, "predict should return (N,) shaped array."

    accuracy = accuracy_score(y_test, class_predictions)
    # Majority classifier on this split achieves 0.6293706293706294
    print(data["name"], metric, "accuracy:", accuracy)
    assert data["base_accuracy"] <= accuracy, "predictions should be at least as good as majority class."

    assert isinstance(class_probabilities, np.ndarray), "probability predictions should be numpy arrays."
    assert (data["test_size"], data["n_classes"]) == class_probabilities.shape, (
        "predict_proba should return (N,K) shaped array."
    )

    # Majority classifier on this split achieves 12.80138131184662
    logloss = log_loss(y_test, class_probabilities)
    print(data["name"], metric, "log-loss:", logloss)
    assert data["base_log_loss"] >= logloss, "predictions should be at least as good as majority class."

    score_to_match = logloss if metric == "neg_log_loss" else accuracy
    assert score_to_match == pytest.approx(gama_score)
    gama.cleanup("all")
    return gama
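# A sketch of the `data` dictionary this helper expects, modeled on the breast
# cancer dataset: the test size follows train_test_split's default 25% test
# fraction (569 samples -> 143 test rows), and the baselines are the majority
# class numbers quoted in the comments above. The target column name is an
# assumption.
from sklearn.datasets import load_breast_cancer

breast_cancer = {
    "name": "breast_cancer",
    "load": load_breast_cancer,
    "target": "class",  # assumed name of the target column in the arff files
    "test_size": 143,
    "n_classes": 2,
    "base_accuracy": 0.62937,
    "base_log_loss": 12.80138,
}
_test_dataset_problem(breast_cancer, metric="accuracy")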
from gama import GamaClassifier

if __name__ == "__main__":
    file_path = "../tests/data/breast_cancer_{}.arff"

    automl = GamaClassifier(max_total_time=180, keep_analysis_log=None, n_jobs=1)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit_arff(file_path.format("train"))

    label_predictions = automl.predict_arff(file_path.format("test"))
    probability_predictions = automl.predict_proba_arff(file_path.format("test"))
def gama_runs(datasets, path, task):
    '''
    Executes GAMA optimization for different OpenML datasets and stores the
    log files in a specified path.

    Parameters:
    -----------
    datasets: list
        Contains the datasets that are going to be optimized using GAMA.
    path: string
        Contains the path to the directory where the log files are stored.
    task: string
        Contains the learning task to specify the GAMA optimization (either
        classification or regression).

    Returns:
    --------
    string
        Contains a confirmation that the optimization process has finished.
    '''
    executed = executed_datasets(path)
    for dataset_id in datasets:
        if dataset_id not in executed:
            try:
                ds = oml.datasets.get_dataset(dataset_id, download_data=False)
                X, y, categorical_indicator, attribute_names = ds.get_data(
                    dataset_format='DataFrame', target=ds.default_target_attribute
                )
                categorical, numeric, string = category_numeric_or_string(
                    X, categorical_indicator
                )
                X, y = impute(X, y, categorical, numeric, string, "median")
                for k in [1, 2, 5, 10, 25]:
                    # Encode k in the log file name: a=1, b=2, c=5, d=10, e=25.
                    log_k = {1: 'a', 2: 'b', 5: 'c', 10: 'd', 25: 'e'}[k]
                    X_adj, y_adj = onehot_or_targ(X, y, categorical, k)
                    if task.lower() == "classification":
                        gama = GamaClassifier(
                            n_jobs=-1,
                            max_total_time=600,
                            scoring='accuracy',
                            keep_analysis_log='{}{}{}.log'.format(path, log_k, dataset_id),
                        )
                    elif task.lower() == "regression":
                        gama = GamaRegressor(
                            n_jobs=-1,
                            max_total_time=600,
                            scoring='r2',
                            keep_analysis_log='{}{}{}.log'.format(path, log_k, dataset_id),
                        )
                    else:
                        return "Please select classification or regression as learning task!"
                    gama.fit(X_adj, y_adj)
            except Exception:
                pass
    return "GAMA has finished running optimization."
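# A minimal sketch of launching the runs, assuming this module's helpers
# (executed_datasets, category_numeric_or_string, impute, onehot_or_targ) are
# importable; the OpenML dataset ids and log path below are hypothetical.
datasets = [31, 37, 61]
print(gama_runs(datasets, path='../data/ex3/', task='classification'))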
cat_vars_index.append(-1)
df.iloc[:, -1].replace(0, 2, inplace=True)

# Divide into equal sets of data ~20,000 samples
B = np.array_split(df, n)
B[0]

# In[9]:

# Initialization
cls = GamaClassifier(
    max_total_time=3600,
    keep_analysis_log=None,
    n_jobs=1,
    scoring='log_loss',
    post_processing_method=EnsemblePostProcessing(),
)
# drift_detector = ADWIN()
drift_detector = EDDM()

start = 1
X_train = B[start - 1].iloc[:, 0:-1]
y_train = B[start - 1].iloc[:, -1]

print("Starting to `fit`")
cls.fit(X_train, y_train, warm_start=True)

anytime_model = cls

# Prequential evaluation
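# A hedged sketch of how the prequential (test-then-train) loop elided above
# might continue: score each incoming batch, feed the per-sample errors to
# the drift detector, and refit with warm_start once drift is signaled. This
# assumes the scikit-multiflow EDDM API (add_element/detected_change) and is
# an illustration, not the original loop body.
for i in range(1, n):
    X_batch = B[i].iloc[:, 0:-1]
    y_batch = B[i].iloc[:, -1]
    predictions = anytime_model.predict(X_batch)
    for true, pred in zip(y_batch, predictions):
        drift_detector.add_element(int(true != pred))
    if drift_detector.detected_change():
        print("Drift detected at batch", i, "- refitting.")
        anytime_model.fit(X_batch, y_batch, warm_start=True)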
def main():
    args = parse_args()

    print("CLI: Processing input")
    if args.input_file.lower().endswith(".csv"):
        raise NotImplementedError("CSV currently not supported.")
        # data = pd.read_csv(args.input_file, sep=args.separator)
    elif not os.path.exists(args.input_file.lower()):
        raise FileNotFoundError(args.input_file)

    if args.input_file.lower().endswith(".arff") and args.mode is None:
        # Determine the task type based on the target column in the arff file
        attributes = load_feature_metadata_from_arff(args.input_file)
        target = list(attributes)[-1] if args.target is None else args.target
        target_type = attributes[target]
        if "{" in target_type:
            # Nominal features are denoted by listing all their values, e.g.
            # {VALUE_1, VALUE_2, ...}
            args.mode = "classification"
        elif target_type.lower() == "real":
            args.mode = "regression"
        else:
            raise ValueError(
                f"Target column {target} has type {target_type}, which GAMA can't model"
            )
        print(f"Detected a {args.mode} problem.")

    print("CLI: Initializing GAMA")
    log_level = logging.INFO if args.verbose else logging.WARNING
    configuration = dict(
        regularize_length=args.prefer_short,
        max_total_time=args.time_limit_m * 60,
        max_eval_time=args.max_eval_time_m * 60,
        n_jobs=args.n_jobs,
        verbosity=log_level,
        keep_analysis_log=args.logpath,
    )
    if args.metric:
        configuration["scoring"] = args.metric

    if args.mode == "regression":
        automl = GamaRegressor(**configuration)
    elif args.mode == "classification":
        automl = GamaClassifier(**configuration)
    else:
        raise ValueError(f"Mode {args.mode} is not valid (--mode).")

    if not args.dry_run:
        print("CLI: Starting model search")
        if args.input_file.lower().endswith(".arff"):
            automl.fit_arff(args.input_file.lower(), target_column=args.target)
        # else:
        #     automl.fit(x, y)

        # == Model Export ===
        print("CLI: Exporting models.")
        with open(args.output_file, "wb") as fh:
            pickle.dump(automl.model, fh)

        if args.export_python is not None:
            automl.export_script(args.export_python, raise_if_exists=False)
    print("done!")
from gama import GamaClassifier

if __name__ == "__main__":
    file_path = "../tests/data/breast_cancer_{}.arff"

    automl = GamaClassifier(max_total_time=180, store="nothing", n_jobs=1)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit_from_file(file_path.format("train"))

    label_predictions = automl.predict_from_file(file_path.format("test"))
    probability_predictions = automl.predict_proba_from_file(file_path.format("test"))