Python GamaClassifierの例、gama.GamaClassifier Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cli.py プロジェクト: prabhant/gama

def main():
    """Run GAMA from the command line: load data, search, export the model.

    Options come from ``parse_args()``. The input must be an existing csv or
    arff file. When ``--mode`` is not given, the task is inferred from the
    target: a categorical dtype means classification, anything else means
    regression. Unless ``--dry-run`` is set, the estimator is fitted, its
    ``model`` is pickled to ``args.output_file`` and, if requested, a Python
    script is exported. On a dry run only ``cleanup("all")`` is called.

    Raises:
        FileNotFoundError: if the input file does not exist.
        ValueError: if the file extension is not csv/arff, or the mode is
            neither "regression" nor "classification".
    """
    args = parse_args()

    print("CLI: Processing input")
    # Use the path exactly as given; only the extension checks are
    # case-insensitive. Lower-casing the path itself would make mixed-case
    # file names unreachable on case-sensitive file systems.
    input_file = args.input_file
    lowered_name = input_file.lower()
    if not os.path.exists(input_file):
        raise FileNotFoundError(input_file)
    if lowered_name.split(".")[-1] not in ["csv", "arff"]:
        raise ValueError("Unknown file extension. Please use csv or arff.")

    kwargs = {}
    if lowered_name.endswith(".csv") and args.separator is not None:
        # The original read the misspelled attribute `args.seperator` here.
        kwargs["sep"] = args.separator

    x, y = X_y_from_file(
        file_path=input_file,
        split_column=args.target,
        **kwargs,
    )
    if args.mode is None:
        if is_categorical_dtype(y.dtype):
            args.mode = "classification"
        else:
            args.mode = "regression"
        print(f"Detected a {args.mode} problem.")

    print("CLI: Initializing GAMA")
    log_level = logging.INFO if args.verbose else logging.WARNING
    configuration = dict(
        regularize_length=args.prefer_short,
        # CLI time limits are multiplied by 60 before being passed to GAMA.
        max_total_time=args.time_limit_m * 60,
        max_eval_time=args.max_eval_time_m * 60,
        n_jobs=args.n_jobs,
        verbosity=log_level,
        output_directory=args.outdir,
        store="nothing" if args.dry_run else "logs",
    )
    if args.metric:
        configuration["scoring"] = args.metric

    if args.mode == "regression":
        automl = GamaRegressor(**configuration)
    elif args.mode == "classification":
        automl = GamaClassifier(**configuration)
    else:
        raise ValueError(f"Mode {args.mode} is not valid (--mode).")

    if not args.dry_run:
        print("CLI: Starting model search")
        automl.fit(x, y)

        # == Model Export ===
        print("CLI: Exporting models.")
        with open(args.output_file, "wb") as fh:
            pickle.dump(automl.model, fh)

        if args.export_python is not None:
            automl.export_script(args.export_python, raise_if_exists=False)
    else:
        automl.cleanup("all")
    print("done!")

コード例 #2

0

ファイルを表示

def prepare_df(log_path, filename_class, filename_regr):
    ''' Executes the transformation from Gama log to df for all the logs in a
            path and writes the results to two csv files.

    Parameters:
    -----------
    log_path: string
        Contains name of the path where the logs are stored.
    filename_class: string
        Contains the name for the csv file of the classification tasks.
    filename_regr: string
        Contains the name for the csv file of the regression tasks.

    Returns:
    --------
    str
        Contains a confirmation that the preparation of the dataframes was executed.
    '''
    # The clustering ids are returned by the helper but not used here.
    classification, regression, _clustering = get_dataset_ids(10000000)
    # Use the `log_path` parameter; the original referenced an undefined
    # name `path`, which raised NameError on every call.
    df_class, df_regr = log_to_df(log_path, classification, regression)

    df_class = df_class.reset_index(drop=True)
    df_regr = df_regr.reset_index(drop=True)

    automl_class = GamaClassifier(scoring='accuracy')
    automl_regr = GamaRegressor(scoring='r2')

    df_class = children_to_components(df_class, automl_class)
    df_regr = children_to_components(df_regr, automl_regr)

    # Both files are written with ';' as the field separator.
    df_class.to_csv(filename_class, index=False, sep=';')
    df_regr.to_csv(filename_regr, index=False, sep=';')

    return "Prepared the dataframes."

コード例 #3

0

ファイルを表示

def test_full_system_multi_core():
    """Run the digits system test with a two-job GamaClassifier."""
    settings = dict(
        random_state=0,
        max_total_time=60,
        max_memory_mb=4_000,
        store="nothing",
        n_jobs=2,
    )
    _gama_on_digits(GamaClassifier(**settings))

コード例 #4

0

ファイルを表示

def main():
    """Command line entry point: fit GAMA on an arff file and export the model.

    CSV input is rejected with NotImplementedError. For arff input without an
    explicit ``--mode``, the task type is derived from the arff type of the
    target attribute. The fitted ``model`` is pickled to ``args.output_file``
    and optionally exported as a Python script.
    """
    args = parse_args()

    print('CLI: Processing input')
    lowered_path = args.input_file.lower()
    if lowered_path.endswith('.csv'):
        raise NotImplementedError("CSV currently not supported.")

    input_is_arff = lowered_path.endswith('.arff')
    if input_is_arff and args.mode is None:
        attributes = load_feature_metadata_from_arff(args.input_file)
        # Without an explicit target, the last attribute is used.
        if args.target is None:
            target = list(attributes)[-1]
        else:
            target = args.target
        target_type = attributes[target]
        if '{' in target_type:
            # Nominal attributes list their values: {VALUE_1, VALUE_2, ...}
            args.mode = 'classification'
        elif target_type.lower() == 'real':
            args.mode = 'regression'
        else:
            raise ValueError(
                f"Target column {target} has type {target_type}, which GAMA can't model."
            )

    print('CLI: Initializing GAMA')
    verbosity = logging.INFO if args.verbose else logging.WARNING
    gama_kwargs = {
        'regularize_length': args.prefer_short,
        'max_total_time': args.time_limit_m * 60,
        'max_eval_time': args.max_eval_time_m * 60,
        'n_jobs': args.n_jobs,
        'verbosity': verbosity,
        'keep_analysis_log': args.logpath,
    }
    if args.metric:
        gama_kwargs['scoring'] = args.metric

    if args.mode not in ('regression', 'classification'):
        raise ValueError(f"Mode {args.mode} is not valid (--mode).")
    if args.mode == 'regression':
        automl = GamaRegressor(**gama_kwargs)
    else:
        automl = GamaClassifier(**gama_kwargs)

    print('CLI: Starting model search')
    # Only arff input triggers a fit; there is no in-memory fit path here.
    if input_is_arff:
        automl.fit_arff(lowered_path, target_column=args.target)

    # == Model Export ===
    print('CLI: Exporting models.')
    with open(args.output_file, 'wb') as model_file:
        pickle.dump(automl.model, model_file)

    if args.export_python is not None:
        automl.export_script(args.export_python, raise_if_exists=False)
    print('done!')

コード例 #5

0

ファイルを表示

ファイル: conftest.py プロジェクト: vumichien/gama

def opset():
    """Return the ``_operator_set`` of a classifier built from ``clf_config``."""
    return GamaClassifier(config=clf_config, scoring="accuracy")._operator_set

コード例 #6

0

ファイルを表示

ファイル: conftest.py プロジェクト: prabhant/gama

def pset():
    """Yield the ``_pset`` of a fresh classifier, then clean the classifier up."""
    classifier = GamaClassifier(
        config=clf_config,
        scoring="accuracy",
        store="nothing",
    )
    yield classifier._pset
    # Runs only when the generator is resumed after the yield.
    classifier.cleanup("all")

コード例 #7

0

ファイルを表示

def gamaclassifier():
    """Build a GamaClassifier with a fixed seed and a max_total_time of 60."""
    options = {"random_state": 0, "max_total_time": 60}
    return GamaClassifier(**options)

コード例 #8

0

ファイルを表示

ファイル: GAMA_T1.py プロジェクト: openml/continual-automl

    cat_vars_index.append(-1)

# Replace the value 0 with 2 in the last column of `df`.
# NOTE(review): an in-place replace on an `.iloc` selection may operate on a
# copy depending on the pandas version — confirm `df` is actually modified.
df.iloc[:, -1].replace(0, 2, inplace=True)

# Divide into equal sets of data ~20,000 samples
# (`n` chunks; both `df` and `n` are defined outside this excerpt.)
B = np.array_split(df, n)

# Bare expression left over from a notebook cell; it has no effect in a script.
B[0]

# In[6]:

# Initialization: fit the classifier on the first chunk only.

cls = GamaClassifier(max_total_time=3600,
                     keep_analysis_log=None,
                     n_jobs=1,
                     scoring='accuracy',
                     post_processing_method=EnsemblePostProcessing())

# Every column except the last is a feature; the last column is the target.
X = B[0].iloc[:, 0:-1]
y = B[0].iloc[:, -1]

print("Starting `fit`")
cls.fit(X, y)

# Alias, not a copy: `anytime_model` and `cls` are the same object.
anytime_model = cls

#Prequential evaluation

for i in range(1, n):

コード例 #9

0

ファイルを表示

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from gama import GamaClassifier

if __name__ == '__main__':
    # Load the breast cancer data and hold out a stratified test split.
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    gama_settings = dict(max_total_time=180, keep_analysis_log=None, n_jobs=1)
    automl = GamaClassifier(**gama_settings)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit(X_train, y_train)

    # Hard labels for accuracy, class probabilities for log loss.
    label_predictions = automl.predict(X_test)
    probability_predictions = automl.predict_proba(X_test)

    accuracy = accuracy_score(y_test, label_predictions)
    print('accuracy:', accuracy)
    test_log_loss = log_loss(y_test, probability_predictions)
    print('log loss:', test_log_loss)

コード例 #10

0

ファイルを表示

ファイル: recommender.py プロジェクト: fabrice-toussaint/thesis

def execute_recommendations(X, y, cat_ind, recommendations, task, n_jobs=1):
    ''' Executes the recommendations made by the nearest neighbor model based on
            a learning task and sets the number of jobs to n_jobs for the final
            estimator of each recommended pipeline.

    Parameters:
    -----------
    X: pd.DataFrame
        Contains the dataframe of a given dataset excluding its target column.
    y: pd.Series
        Contains the series of the target of a given dataset.
    cat_ind: list
        Contains boolean values to determine whether a column is categorical or
        not.
    recommendations: list
        Contains the recommendations made by the nearest neighbor model. Each
        entry is unpacked as (pipeline string, k, dataset id).
    task: str
        Contains the learning task (i.e. "classification" or "regression")
    n_jobs: int
        Value assigned to the `n_jobs` attribute of the last component of each
        recommended pipeline.

    Returns:
    --------
    list
        Contains the mean 10-fold cross-validation score of each pipeline run
        on X and y; 0 is recorded for a pipeline that fails. For an unknown
        task a message string is returned instead of a list.
    '''
    n_folds = 10

    categorical, numeric, string = category_numeric_or_string(X, cat_ind)

    if task.lower() == "classification":
        gama = GamaClassifier(scoring='accuracy')
    elif task.lower() == "regression":
        gama = GamaRegressor(scoring='r2')
    else:
        return "{} is not implemented, please try 'classification' or 'regression'".format(
            task)

    scores = []

    for recommendation in recommendations:
        pipeline, k, did = recommendation
        ind = Individual.from_string(pipeline, gama._pset)

        # Work on copies so that one pipeline's preprocessing cannot leak into
        # the next recommendation.
        X_pipe = deepcopy(X)
        y_pipe = deepcopy(y)

        X_pipe, y_pipe = onehot_or_targ(X_pipe, y_pipe, categorical, k)

        # SECURITY NOTE(review): `eval` executes the primitive strings as code.
        # This is only acceptable if the recommendation strings are trusted.
        steps = [eval(p.str_nonrecursive) for p in ind.primitives]
        # The list is reversed so that the component evaluated last is the one
        # scored with cross-validation.
        steps.reverse()
        last_position = len(steps) - 1

        try:
            for position, component in enumerate(steps):
                if position == last_position:
                    try:
                        setattr(component, 'n_jobs', n_jobs)
                    except Exception:
                        # Best effort: a component that rejects the attribute
                        # is still evaluated.
                        pass

                    cv_scores = cross_val_score(component,
                                                X_pipe,
                                                y_pipe,
                                                cv=n_folds)
                    scores.append(sum(cv_scores) / n_folds)
                elif isinstance(component, (SelectPercentile, SelectFwe)):
                    # These selectors are fitted with the target as well.
                    X_pipe = component.fit_transform(X_pipe, y_pipe)
                else:
                    X_pipe = component.fit_transform(X_pipe)
        except Exception:
            # A failing pipeline scores 0. `Exception` (not a bare except)
            # keeps KeyboardInterrupt and SystemExit propagating.
            scores.append(0)

    return scores

コード例 #11

0

ファイルを表示

ファイル: train_gama.py プロジェクト: jim-schwoebel/allie

def train_gama(X_train, X_test, y_train, y_test, mtype, common_name_model,
               problemtype, classes, default_featurenames, transform_model,
               settings, model_session):
    """Fit a GAMA model, print test metrics and pickle the fitted estimator.

    Args:
        X_train, y_train: training data passed to ``fit``.
        X_test, y_test: held-out data used for the printed metrics.
        mtype: ``'c'`` for classification; ``'r'`` or ``'regression'`` for
            regression.
        common_name_model: base name of the output file; ``'.pickle'`` is
            appended.
        problemtype, classes, default_featurenames, transform_model,
        settings, model_session: accepted but not used in this function.

    Returns:
        Tuple ``(model_name, model_dir, files)``: the pickle file name, the
        current working directory, and a list holding the pickle file name.

    Raises:
        ValueError: if ``mtype`` is not one of the supported values.
    """
    is_classification = mtype in ['c']
    is_regression = mtype in ['regression', 'r']
    if not (is_classification or is_regression):
        # Fail before any work is done. Previously an unknown mtype created an
        # empty pickle file and then crashed on the undefined `automl`.
        raise ValueError(
            "Unsupported mtype {!r}; expected 'c', 'r' or 'regression'.".format(
                mtype))

    model_name = common_name_model + '.pickle'
    files = list()

    if is_classification:

        automl = GamaClassifier(max_total_time=180, keep_analysis_log=None)
        print(
            "Starting GAMA `fit` - usually takes around 3 minutes but can take longer for large datasets"
        )
        automl.fit(X_train, y_train)

        label_predictions = automl.predict(X_test)
        probability_predictions = automl.predict_proba(X_test)

        accuracy = accuracy_score(y_test, label_predictions)
        log_loss_pred = log_loss(y_test, probability_predictions)
        log_loss_score = automl.score(X_test, y_test)

        print('accuracy:', accuracy)
        print('log loss pred:', log_loss_pred)
        print('log_loss_score', log_loss_score)

    else:

        automl = GamaRegressor(max_total_time=180,
                               keep_analysis_log=None,
                               n_jobs=1)
        print(
            "Starting GAMA `fit` - usually takes around 3 minutes but can take longer for large datasets"
        )
        automl.fit(X_train, y_train)

        predictions = automl.predict(X_test)
        mse_error = mean_squared_error(y_test, predictions)
        print("MSE:", mse_error)

    # SAVE ML MODEL
    # The context manager closes the file even if pickling fails.
    with open(model_name, 'wb') as modelfile:
        pickle.dump(automl, modelfile)

    files.append(model_name)
    model_dir = os.getcwd()

    return model_name, model_dir, files

コード例 #12

0

ファイルを表示

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from gama import GamaClassifier

if __name__ == "__main__":
    # Load the breast cancer data and hold out a stratified test split.
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    gama_settings = dict(max_total_time=180, store="nothing", n_jobs=1)
    automl = GamaClassifier(**gama_settings)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit(X_train, y_train)

    # Hard labels for accuracy, class probabilities for log loss.
    label_predictions = automl.predict(X_test)
    probability_predictions = automl.predict_proba(X_test)

    accuracy = accuracy_score(y_test, label_predictions)
    print("accuracy:", accuracy)
    test_log_loss = log_loss(y_test, probability_predictions)
    print("log loss:", test_log_loss)

コード例 #13

0

ファイルを表示

    --------
    pd.DataFrame
        Contains a pd.DataFrame for that specific log.
    '''
    report = GamaReport(logfile=log_file)
    return report.evaluations


if __name__ == "__main__":
    # Single log file example.
    log_to_df_file("../data/ex3/a411.log")

    # Example over every log file that matches a glob pattern.
    classification, regression, clustering = get_dataset_ids(10000000)
    load_path = '../data/ex3/*.log'
    filename_class = '../data/ex3/testc.csv'
    filename_regr = '../data/ex3/testr.csv'
    df_class, df_regr = log_to_df(load_path, classification, regression)

    print(df_class, df_regr)
    df_class = df_class.reset_index(drop=True)
    df_regr = df_regr.reset_index(drop=True)

    automl_regr = GamaRegressor(scoring='r2')
    automl_class = GamaClassifier(scoring='accuracy')

    # Convert and write the classification frame first, then the regression one.
    class_components = children_to_components(df_class, automl_class)
    class_components.to_csv(filename_class, index=False)
    regr_components = children_to_components(df_regr, automl_regr)
    regr_components.to_csv(filename_regr, index=False)

コード例 #14

0

ファイルを表示

def _test_dataset_problem(data,
                          metric: str,
                          arff: bool = False,
                          y_type: Type = pd.DataFrame,
                          search: BaseSearch = AsyncEA(),
                          missing_values: bool = False,
                          max_time: int = 60):
    """Fit GamaClassifier on a dataset and check its predictions and score.

    :param data: dict describing the dataset. The keys read here are 'name',
        'load', 'target', 'test_size', 'n_classes', 'base_accuracy' and
        'base_log_loss'.
    :param metric: scoring metric passed to GamaClassifier.
    :param arff: if True, fit/predict from the arff files under tests/data
        instead of in-memory data.
    :param y_type: pd.DataFrame, pd.Series, np.ndarray or str
    :param search: search method passed to GamaClassifier.
        NOTE(review): the default instance is created once at definition time
        and shared by all calls — confirm it carries no state between runs.
    :param missing_values: if True, overwrite some entries of the first two
        feature columns with NaN before fitting.
    :param max_time: time budget passed as `max_total_time`; the fit-time
        assertion is scaled by it.
    :return: None
    """
    gama = GamaClassifier(
        random_state=0,
        max_total_time=max_time,
        scoring=metric,
        search_method=search,
        n_jobs=1,
        post_processing_method=EnsemblePostProcessing(ensemble_size=5))
    if arff:
        train_path = 'tests/data/{}_train.arff'.format(data['name'])
        test_path = 'tests/data/{}_test.arff'.format(data['name'])

        # The in-memory split is only needed for y_test; the features come
        # from the arff files.
        X, y = data['load'](return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=0)
        y_test = [str(val) for val in y_test]

        with Stopwatch() as sw:
            gama.fit_arff(train_path, target_column=data['target'])
        class_predictions = gama.predict_arff(test_path,
                                              target_column=data['target'])
        class_probabilities = gama.predict_proba_arff(
            test_path, target_column=data['target'])
        gama_score = gama.score_arff(test_path)
    else:
        X, y = data['load'](return_X_y=True)
        if y_type == str:
            # Map integer class ids to their string names.
            databunch = data['load']()
            y = np.asarray(
                [databunch.target_names[c_i] for c_i in databunch.target])
        if y_type in [pd.Series, pd.DataFrame]:
            y = y_type(y)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=0)
        if missing_values:
            X_train[1:300:2, 0] = X_train[2:300:5, 1] = float("NaN")
            X_test[1:100:2, 0] = X_test[2:100:5, 1] = float("NaN")

        with Stopwatch() as sw:
            gama.fit(X_train, y_train)
        class_predictions = gama.predict(X_test)
        class_probabilities = gama.predict_proba(X_test)
        gama_score = gama.score(X_test, y_test)

    # Compare against the budget actually given to GAMA rather than a
    # hard-coded 60, so a non-default `max_time` is checked correctly.
    assert max_time * FIT_TIME_MARGIN > sw.elapsed_time, 'fit must stay within 110% of allotted time.'

    assert isinstance(class_predictions,
                      np.ndarray), 'predictions should be numpy arrays.'
    assert (
        data['test_size'],
    ) == class_predictions.shape, 'predict should return (N,) shaped array.'

    accuracy = accuracy_score(y_test, class_predictions)
    # Majority classifier on this split achieves 0.6293706293706294
    print(data['name'], metric, 'accuracy:', accuracy)
    assert data[
        'base_accuracy'] <= accuracy, 'predictions should be at least as good as majority class.'

    assert isinstance(
        class_probabilities,
        np.ndarray), 'probability predictions should be numpy arrays.'
    assert (data['test_size'],
            data['n_classes']) == class_probabilities.shape, (
                'predict_proba should return'
                ' (N,K) shaped array.')

    # Majority classifier on this split achieves 12.80138131184662
    logloss = log_loss(y_test, class_probabilities)
    print(data['name'], metric, 'log-loss:', logloss)
    assert data[
        'base_log_loss'] >= logloss, 'predictions should be at least as good as majority class.'

    # `score` must agree with the metric GAMA was asked to optimise.
    score_to_match = logloss if metric == 'log_loss' else accuracy
    assert score_to_match == pytest.approx(gama_score)

コード例 #15

0

ファイルを表示

ファイル: conftest.py プロジェクト: vumichien/gama

def pset():
    """Return the ``_pset`` of a classifier built from ``clf_config``."""
    return GamaClassifier(config=clf_config, scoring="accuracy")._pset

コード例 #16

0

ファイルを表示

ファイル: GAMA_DI.py プロジェクト: openml/continual-automl

    },
}

# In[ ]:

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
)

# Initialization
counter = 0

# Classifier restricted to `limited_config` (defined outside this excerpt).
cls = GamaClassifier(max_total_time=3600,
                     keep_analysis_log=None,
                     n_jobs=1,
                     scoring='log_loss',
                     post_processing_method=EnsemblePostProcessing(),
                     config=limited_config)

drift_detector = EDDM()

# `start` is 1-based: the initial fit uses chunk B[start - 1], i.e. B[0].
# `B` is defined outside this excerpt.
start = 1
# Every column except the last is a feature; the last column is the target.
X_train = B[start - 1].iloc[:, 0:-1]
y_train = B[start - 1].iloc[:, -1]

print("Starting to `fit`")
cls.fit(X_train, y_train)

# Alias, not a copy: `anytime_model` and `cls` are the same object.
anytime_model = cls

#Prequential evaluation

コード例 #17

0

ファイルを表示

def _test_dataset_problem(
    data,
    metric: str,
    arff: bool = False,
    y_type: Type = pd.DataFrame,
    search: BaseSearch = AsyncEA(),
    missing_values: bool = False,
    max_time: int = 60,
):
    """Fit GamaClassifier on a dataset and check its predictions and score.

    :param data: dict describing the dataset. The keys read here are "name",
        "load", "target", "test_size", "n_classes", "base_accuracy" and
        "base_log_loss".
    :param metric: scoring metric passed to GamaClassifier.
    :param arff: if True, fit/predict from the arff files under tests/data
        instead of in-memory data.
    :param y_type: pd.DataFrame, pd.Series, np.ndarray or str
    :param search: search method passed to GamaClassifier.
        NOTE(review): the default instance is created once at definition time
        and shared by all calls — confirm it carries no state between runs.
    :param missing_values: if True, overwrite some entries of the first two
        feature columns with NaN before fitting.
    :param max_time: time budget passed as `max_total_time`; the fit-time
        assertion is scaled by it.
    :return: the fitted GamaClassifier, after `cleanup("all")` was called.
    """
    gama = GamaClassifier(
        random_state=0,
        max_total_time=max_time,
        scoring=metric,
        search=search,
        n_jobs=1,
        post_processing=EnsemblePostProcessing(ensemble_size=5),
        store="nothing",
    )
    if arff:
        train_path = f"tests/data/{data['name']}_train.arff"
        test_path = f"tests/data/{data['name']}_test.arff"

        # The in-memory split is only needed for y_test; the features come
        # from the arff files.
        X, y = data["load"](return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=0)
        y_test = [str(val) for val in y_test]

        with Stopwatch() as sw:
            gama.fit_from_file(train_path, target_column=data["target"])
        class_predictions = gama.predict_from_file(
            test_path, target_column=data["target"])
        class_probabilities = gama.predict_proba_from_file(
            test_path, target_column=data["target"])
        gama_score = gama.score_from_file(test_path)
    else:
        X, y = data["load"](return_X_y=True)
        if y_type == str:
            # Map integer class ids to their string names.
            databunch = data["load"]()
            y = np.asarray(
                [databunch.target_names[c_i] for c_i in databunch.target])
        if y_type in [pd.Series, pd.DataFrame]:
            y = y_type(y)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=0)
        if missing_values:
            X_train[1:300:2, 0] = X_train[2:300:5, 1] = float("NaN")
            X_test[1:100:2, 0] = X_test[2:100:5, 1] = float("NaN")

        with Stopwatch() as sw:
            gama.fit(X_train, y_train)
        class_predictions = gama.predict(X_test)
        class_probabilities = gama.predict_proba(X_test)
        gama_score = gama.score(X_test, y_test)

    # Compare against the budget actually given to GAMA rather than a
    # hard-coded 60, so a non-default `max_time` is checked correctly.
    assert (max_time * FIT_TIME_MARGIN >
            sw.elapsed_time), "fit must stay within 110% of allotted time."

    assert isinstance(class_predictions,
                      np.ndarray), "predictions should be numpy arrays."
    assert (
        data["test_size"],
    ) == class_predictions.shape, "predict should return (N,) shaped array."

    accuracy = accuracy_score(y_test, class_predictions)
    # Majority classifier on this split achieves 0.6293706293706294
    print(data["name"], metric, "accuracy:", accuracy)
    assert (data["base_accuracy"] <= accuracy
            ), "predictions should be at least as good as majority class."

    assert isinstance(
        class_probabilities,
        np.ndarray), "probability predictions should be numpy arrays."
    assert (data["test_size"],
            data["n_classes"]) == class_probabilities.shape, (
                "predict_proba should return"
                " (N,K) shaped array.")

    # Majority classifier on this split achieves 12.80138131184662
    logloss = log_loss(y_test, class_probabilities)
    print(data["name"], metric, "log-loss:", logloss)
    assert (data["base_log_loss"] >= logloss
            ), "predictions should be at least as good as majority class."

    # `score` must agree with the metric GAMA was asked to optimise.
    score_to_match = logloss if metric == "neg_log_loss" else accuracy
    assert score_to_match == pytest.approx(gama_score)
    gama.cleanup("all")
    return gama

コード例 #18

0

ファイルを表示

from gama import GamaClassifier

if __name__ == "__main__":
    # Template path; "{}" is filled with "train" or "test".
    file_path = "../tests/data/breast_cancer_{}.arff"

    gama_settings = dict(max_total_time=180, keep_analysis_log=None, n_jobs=1)
    automl = GamaClassifier(**gama_settings)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit_arff(file_path.format("train"))

    # Predict labels and class probabilities straight from the test arff file.
    label_predictions = automl.predict_arff(
        file_path.format("test"))
    probability_predictions = automl.predict_proba_arff(
        file_path.format("test"))

コード例 #19

0

ファイルを表示

def gama_runs(datasets, path, task):
    ''' Executes Gama optimization for different OpenML datasets and stores the
    log files in a specified path.

    Each dataset that is not already listed by `executed_datasets(path)` is
    imputed and then optimized once per value of k in (1, 2, 5, 10, 25). The
    log file of a run is named `<path><letter><dataset_id>.log`, where the
    letter a-e identifies k.

    Parameters:
    -----------
    datasets: list
        Contains datasets that are going to be optimized using Gama.
    path: string
        Contains the path to the directory in where the files are logged.
    task: string
        Contains learning task to specify the GAMA optimization (either classi-
        fication or regression).

    Returns:
    --------
    string
        Contains a confirmation that the optimization process has finished, or
        a message asking for a valid task if `task` is not supported.
    '''
    import logging  # local import: the file's import block is not visible here

    # (k, letter used in the log file name for that k)
    k_log_prefixes = ((1, 'a'), (2, 'b'), (5, 'c'), (10, 'd'), (25, 'e'))

    executed = executed_datasets(path)
    for dataset_id in datasets:
        if dataset_id in executed:
            continue
        try:
            ds = oml.datasets.get_dataset(dataset_id, download_data=False)
            X, y, categorical_indicator, attribute_names = ds.get_data(
                dataset_format='DataFrame',
                target=ds.default_target_attribute)

            categorical, numeric, string = category_numeric_or_string(
                X, categorical_indicator)
            X, y = impute(X, y, categorical, numeric, string, "median")

            for k, log_k in k_log_prefixes:
                X_adj, y_adj = onehot_or_targ(X, y, categorical, k)
                if task.lower() == "classification":
                    gama = GamaClassifier(
                        n_jobs=-1,
                        max_total_time=600,
                        scoring='accuracy',
                        keep_analysis_log='{}{}{}.log'.format(
                            path, log_k, dataset_id))
                elif task.lower() == "regression":
                    gama = GamaRegressor(
                        n_jobs=-1,
                        max_total_time=600,
                        scoring='r2',
                        keep_analysis_log='{}{}{}.log'.format(
                            path, log_k, dataset_id))
                else:
                    return "Please select classification or regression as learning task!"
                gama.fit(X_adj, y_adj)
        except Exception:
            # Best effort: a failing dataset must not abort the whole batch,
            # but the failure is recorded instead of vanishing. Catching
            # `Exception` (not a bare except) lets Ctrl+C stop the run.
            logging.getLogger(__name__).exception(
                "GAMA run failed for dataset %s; skipping.", dataset_id)

    return "Gama has finished running optimization."

コード例 #20

0

ファイルを表示

    cat_vars_index.append(-1)

# Replace the value 0 with 2 in the last column of `df`.
# NOTE(review): an in-place replace on an `.iloc` selection may operate on a
# copy depending on the pandas version — confirm `df` is actually modified.
df.iloc[:, -1].replace(0, 2, inplace=True)

# Divide into equal sets of data ~20,000 samples
# (`n` chunks; both `df` and `n` are defined outside this excerpt.)
B = np.array_split(df, n)

# Bare expression left over from a notebook cell; it has no effect in a script.
B[0]

# In[9]:

# Initialization: fit the classifier on the first chunk only.

cls = GamaClassifier(max_total_time=3600,
                     keep_analysis_log=None,
                     n_jobs=1,
                     scoring='log_loss',
                     post_processing_method=EnsemblePostProcessing())
# Alternative drift detector, currently disabled:
#drift_detector = ADWIN()
drift_detector = EDDM()

# `start` is 1-based: the initial fit uses chunk B[start - 1], i.e. B[0].
start = 1
# Every column except the last is a feature; the last column is the target.
X_train = B[start - 1].iloc[:, 0:-1]
y_train = B[start - 1].iloc[:, -1]

print("Starting to `fit`")
cls.fit(X_train, y_train, warm_start=True)

# Alias, not a copy: `anytime_model` and `cls` are the same object.
anytime_model = cls

#Prequential evaluation

コード例 #21

0

ファイルを表示

def main():
    """Command line entry point: fit GAMA on an arff file and export the model.

    CSV input is rejected with NotImplementedError and a missing input file
    with FileNotFoundError. For arff input without an explicit ``--mode``, the
    task type is derived from the arff type of the target attribute. Unless
    ``--dry-run`` is set, the estimator is fitted, its ``model`` is pickled to
    ``args.output_file`` and optionally exported as a Python script.
    """
    args = parse_args()

    print("CLI: Processing input")
    lowered_path = args.input_file.lower()
    if lowered_path.endswith(".csv"):
        raise NotImplementedError("CSV currently not supported.")
    if not os.path.exists(lowered_path):
        raise FileNotFoundError(args.input_file)

    input_is_arff = lowered_path.endswith(".arff")
    if input_is_arff and args.mode is None:
        # Determine the task type based on the target column in the arff file
        attributes = load_feature_metadata_from_arff(args.input_file)
        # Without an explicit target, the last attribute is used.
        if args.target is None:
            target = list(attributes)[-1]
        else:
            target = args.target
        target_type = attributes[target]
        if "{" in target_type:
            # Nominal attributes list their values: {VALUE_1, VALUE_2, ...}
            args.mode = "classification"
        elif target_type.lower() == "real":
            args.mode = "regression"
        else:
            raise ValueError(
                f"Target column {target} has type {target_type}, which GAMA can't model"
            )
        print(f"Detected a {args.mode} problem.")

    print("CLI: Initializing GAMA")
    verbosity = logging.INFO if args.verbose else logging.WARNING
    gama_kwargs = {
        "regularize_length": args.prefer_short,
        "max_total_time": args.time_limit_m * 60,
        "max_eval_time": args.max_eval_time_m * 60,
        "n_jobs": args.n_jobs,
        "verbosity": verbosity,
        "keep_analysis_log": args.logpath,
    }
    if args.metric:
        gama_kwargs["scoring"] = args.metric

    if args.mode not in ("regression", "classification"):
        raise ValueError(f"Mode {args.mode} is not valid (--mode).")
    if args.mode == "regression":
        automl = GamaRegressor(**gama_kwargs)
    else:
        automl = GamaClassifier(**gama_kwargs)

    if args.dry_run:
        # A dry run stops after the estimator has been constructed.
        print("done!")
        return

    print("CLI: Starting model search")
    # Only arff input triggers a fit; there is no in-memory fit path here.
    if input_is_arff:
        automl.fit_arff(lowered_path, target_column=args.target)

    # == Model Export ===
    print("CLI: Exporting models.")
    with open(args.output_file, "wb") as model_file:
        pickle.dump(automl.model, model_file)

    if args.export_python is not None:
        automl.export_script(args.export_python, raise_if_exists=False)
    print("done!")

コード例 #22

0

ファイルを表示

ファイル: arff_example.py プロジェクト: prabhant/gama

from gama import GamaClassifier

if __name__ == "__main__":
    # Template path; "{}" is filled with "train" or "test".
    file_path = "../tests/data/breast_cancer_{}.arff"

    gama_settings = dict(max_total_time=180, store="nothing", n_jobs=1)
    automl = GamaClassifier(**gama_settings)
    print("Starting `fit` which will take roughly 3 minutes.")
    automl.fit_from_file(file_path.format("train"))

    # Predict labels and class probabilities straight from the test arff file.
    label_predictions = automl.predict_from_file(
        file_path.format("test"))
    probability_predictions = automl.predict_proba_from_file(
        file_path.format("test"))