Example #1
    def update_data_table(filename, mode):
        if filename is not None and os.path.isfile(filename):
            if mode in ["all", "small"]:
                df = file_to_pandas(filename)
                if mode == "small":
                    df = df.head(50)

                data_table = dash_table.DataTable(
                    id="table",
                    columns=[{
                        "name": c,
                        "id": c
                    } for c in df.columns],
                    data=df.to_dict("records"),
                    editable=False,
                    style_table={
                        "maxHeight": "500px",
                        "overflowY": "scroll"
                    },
                )
                attributes = list(df.columns)
            else:
                # Preview is disabled: only read the attribute names from the
                # file's metadata instead of loading the full dataset.
                data_table = "Preview not enabled."
                attributes = list(load_feature_metadata_from_file(filename))

            target_options = [{"label": c, "value": c} for c in attributes]
            default_target = attributes[-1]

            return [data_table], target_options, default_target, False
        return ["No data loaded"], [{"label": "-", "value": "a"}], "a", True
Example #2
def run(dataset, config):
    log.info("\n**** GAMA [v%s] ****", __version__)
    log.info("sklearn == %s", sklearn.__version__)
    log.info("category_encoders == %s", category_encoders.__version__)

    is_classification = (config.type == 'classification')
    # Mapping of benchmark metrics to GAMA metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='neg_log_loss',
        mae='neg_mean_absolute_error',
        mse='neg_mean_squared_error',
        msle='neg_mean_squared_log_error',
        r2='r2',
        rmse='neg_mean_squared_error',  # minimizing MSE also minimizes RMSE
    )
    scoring_metric = metrics_mapping.get(config.metric)
    if scoring_metric is None:
        raise ValueError("Performance metric {} not supported.".format(config.metric))

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config

    # Derive the dataset id and fold number from the training file path,
    # e.g. ".../<did>/<name>_<fold>.<ext>".
    *_, did, fold = dataset.train_path.split('/')
    fold = fold.split('.')[0].split('_')[-1]

    log.info(
        'Running GAMA with a maximum time of %ss on %s cores, optimizing %s.',
        config.max_runtime_seconds, n_jobs, scoring_metric)

    estimator = GamaClassifier if is_classification else GamaRegressor
    kwargs = dict(n_jobs=n_jobs,
                  max_total_time=config.max_runtime_seconds,
                  scoring=scoring_metric,
                  random_state=config.seed,
                  **training_params)
    version_leq_20_2_0 = version.parse(__version__) <= version.parse('20.2.0')
    if version_leq_20_2_0:
        log_file = os.path.join(config.output_dir, "logs",
                                '{}_{}.log'.format(did, fold))
        utils.touch(log_file)
        kwargs['keep_analysis_log'] = log_file
    else:
        kwargs['max_memory_mb'] = config.max_mem_size_mb
        kwargs['output_directory'] = os.path.join(config.output_dir, "gama")

    gama_automl = estimator(**kwargs)

    data = file_to_pandas(dataset.train_path, encoding='utf-8')
    x = data.loc[:, data.columns != dataset.target]
    y = data.loc[:, dataset.target]

    with utils.Timer() as training_timer:
        gama_automl.fit(x, y)

    data = file_to_pandas(dataset.test_path, encoding='utf-8')
    x = data.loc[:, data.columns != dataset.target]
    log.info('Predicting on the test set.')
    with utils.Timer() as predict_timer:
        predictions = gama_automl.predict(x)
    if is_classification:
        probabilities = gama_automl.predict_proba(x)
    else:
        probabilities = None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  probabilities=probabilities,
                  target_is_encoded=False,
                  models_count=len(gama_automl._final_pop),
                  training_duration=training_timer.duration,
                  predict_duration=predict_timer.duration)
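The did/fold parsing in Example #2 depends entirely on the benchmark's directory layout. A small worked example with a hypothetical path (the real layout may differ):

path = "/bench/datasets/credit-g/31/credit-g_train_0.arff"  # hypothetical layout
*_, did, fold = path.split('/')
fold = fold.split('.')[0].split('_')[-1]
print(did, fold)  # prints: 31 0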
Example #3
    def test_file_to_pandas_arff(self):
        df = file_to_pandas(ARFF_CJS)
        _test_df_d23380(df)
Example #4
    def test_file_to_pandas_invalid(self):
        with pytest.raises(ValueError, match="files supported."):
            file_to_pandas("myfile.txt")
Example #5
    def test_file_to_pandas_csv(self):
        df = file_to_pandas(CSV_CJS_FULL)
        _test_df_d23380(df)
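None of the examples show file_to_pandas itself, but its contract can be read off the calls above: it loads ARFF and CSV files into a DataFrame, accepts an encoding keyword, and raises a ValueError whose message contains "files supported." for any other extension. A minimal reconstruction under those assumptions (the extension dispatch and the scipy-based ARFF loading are guesses, not the original implementation):

import os

import pandas as pd
from scipy.io import arff


def file_to_pandas(filename, encoding='utf-8'):
    # Hypothetical reconstruction: dispatch on the file extension.
    ext = os.path.splitext(filename)[1].lower()
    if ext == '.csv':
        return pd.read_csv(filename, encoding=encoding)
    if ext == '.arff':
        data, _meta = arff.loadarff(filename)
        df = pd.DataFrame(data)
        # scipy returns nominal attributes as byte strings; decode them.
        for col in df.columns:
            if df[col].dtype == object:
                df[col] = df[col].str.decode(encoding)
        return df
    raise ValueError("Only .csv and .arff files supported.")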