Example no. 1
import os

# Imports inferred from how this snippet uses these names; load_task comes
# from the helper module also used in Example no. 5.
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import balanced_accuracy
from update_metadata_util import load_task


def main(working_directory, time_limit, per_run_time_limit, task_id, seed):
    configuration_output_dir = os.path.join(working_directory, str(seed))
    try:
        os.makedirs(configuration_output_dir)
    except Exception as _:
        print(
            "Direcotry {0} aleardy created.".format(configuration_output_dir))

    tmp_dir = os.path.join(configuration_output_dir, str(task_id))
    #try:
    #    os.makedirs(tmp_dir)
    #except Exception as _:
    #    print("Direcotry {0} aleardy created.".format(configuration_output_dir))

    automl_arguments = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': per_run_time_limit,
        'initial_configurations_via_metalearning': 0,
        'ensemble_size': 0,
        'seed': seed,
        'ml_memory_limit': 3072,
        'resampling_strategy': 'holdout',
        'resampling_strategy_arguments': {
            'train_size': 0.67
        },
        #'resampling_strategy': 'cv',
        #'resampling_strategy_arguments': {'folds': 5},
        'tmp_folder': tmp_dir,
        'delete_tmp_folder_after_terminate': False,
        'disable_evaluator_output': False,
    }

    X_train, y_train, X_test, y_test, cat = load_task(task_id)

    automl = AutoSklearnClassifier(**automl_arguments)

    automl.fit(X_train,
               y_train,
               dataset_name=str(task_id),
               X_test=X_test,
               y_test=y_test,
               metric=balanced_accuracy)

    with open(os.path.join(tmp_dir, "score_vanilla.csv"), 'w') as fh:
        T = 0
        fh.write("Time,Train Performance,Test Performance\n")
        # Write the starting point: time 0, train performance 0, test performance 0.
        best_loss = 1
        fh.write("{0},{1},{2}\n".format(T, 0, 0))
        # Iterate over the run history; rank is computed based on error.
        for key, value in automl._automl.runhistory_.data.items():
            t = value.time
            loss = value.cost
            T += t

            if loss < best_loss:
                fh.write("{0},{1},{2}\n".format(
                    T, 1 - loss,
                    1 - value.additional_info.get('test_loss', 1.0)))
                best_loss = loss
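A minimal sketch of how main() above could be invoked. The working directory, time budgets, and seed below are placeholder values, and task id 252 is borrowed from Example no. 5; none of them are prescribed by the original script.

if __name__ == '__main__':
    # Placeholder arguments; adjust to the actual benchmark setup.
    main(working_directory='/tmp/vanilla_runs',
         time_limit=3600,
         per_run_time_limit=360,
         task_id=252,
         seed=1)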
Example no. 2
def calculate_metafeatures(task_id):
    X_train, y_train, X_test, y_test, cat, task_type, dataset_name = load_task(task_id)
    watch = StopWatch()

    if task_type == 'classification':
        if len(np.unique(y_train)) == 2:
            task_type = BINARY_CLASSIFICATION
        else:
            task_type = MULTICLASS_CLASSIFICATION
    else:
        task_type = REGRESSION

    _metafeatures_labels = _calculate_metafeatures(
        x_train=X_train, y_train=y_train, data_feat_type=cat,
        data_info_task=task_type, basename=dataset_name, logger_=logger,
        watcher=watch,
    )

    _metafeatures_encoded_labels = _calculate_metafeatures_encoded(
        x_train=X_train, y_train=y_train, data_feat_type=cat,
        task=task_type, basename=dataset_name, logger_=logger,
        watcher=watch,
    )

    mf = _metafeatures_labels
    mf.metafeature_values.update(
        _metafeatures_encoded_labels.metafeature_values)

    return mf
def calculate_metafeatures(task_id):
    print(task_id)
    X_train, y_train, X_test, y_test, cat = load_task(task_id)
    categorical = [c == 'categorical' for c in cat]

    _metafeatures_labels = metafeatures.calculate_all_metafeatures_with_labels(
        X_train, y_train, [False] * X_train.shape[1], task_id)

    X_train, sparse = perform_one_hot_encoding(scipy.sparse.issparse(X_train),
                                               categorical, [X_train])
    X_train = X_train[0]
    categorical = [False] * X_train.shape[1]

    start_time = time.time()
    obj = pynisher.enforce_limits(mem_in_mb=3072)(
        metafeatures.calculate_all_metafeatures_encoded_labels)
    _metafeatures_encoded_labels = obj(X_train, y_train, categorical, task_id)
    end_time = time.time()

    if obj.exit_status == pynisher.MemorylimitException:
        # During the conversion of the dataset (rescaling, etc...), it can
        # happen that we run out of memory.
        _metafeatures_encoded_labels = \
            metafeature.DatasetMetafeatures(task_id, dict())

        metafeature_calculation_time = (end_time - start_time) / \
                                       len(metafeatures.npy_metafeatures)

        for metafeature_name in metafeatures.npy_metafeatures:
            type_ = "HELPERFUNCTION" if metafeature_name not in \
                                        metafeatures.metafeatures.functions \
                else "METAFEATURE"
            _metafeatures_encoded_labels.metafeature_values[metafeature_name] = \
                metafeature.MetaFeatureValue(metafeature_name, type_, 0, 0,
                                             np.NaN, metafeature_calculation_time,
                                             "Memory error during dataset scaling.")

    mf = _metafeatures_labels
    mf.metafeature_values.update(
        _metafeatures_encoded_labels.metafeature_values)

    return mf
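A hedged usage sketch for either calculate_metafeatures variant above, assuming load_task and the metafeature helpers are importable in the same script; it simply prints every computed meta-feature. Task id 252 is a placeholder taken from Example no. 5.

mf = calculate_metafeatures(252)
for name, value in mf.metafeature_values.items():
    print(name, value)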
Example no. 4
    parser.add_argument('--per-run-time-limit', type=int, required=True)
    parser.add_argument('--task-id', type=int, required=True)
    parser.add_argument('--metric', type=str, required=True)
    parser.add_argument('-s', '--seed', type=int, required=True)
    parser.add_argument('--unittest', action='store_true')
    args = parser.parse_args()

    working_directory = args.working_directory
    time_limit = args.time_limit
    per_run_time_limit = args.per_run_time_limit
    task_id = args.task_id
    seed = args.seed
    metric = args.metric
    is_test = args.unittest

    X_train, y_train, X_test, y_test, cat, task_type, dataset_name = load_task(
        task_id)

    configuration_output_dir = os.path.join(working_directory, 'configuration',
                                            task_type)
    os.makedirs(configuration_output_dir, exist_ok=True)
    tmp_dir = os.path.join(configuration_output_dir, str(task_id), metric)
    os.makedirs(tmp_dir, exist_ok=True)

    tempdir = tempfile.mkdtemp()
    autosklearn_directory = os.path.join(tempdir, "dir")

    automl_arguments = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': per_run_time_limit,
        'initial_configurations_via_metalearning': 0,
        'ensemble_size': 0,
Example no. 5
from mosaic_ml.automl import AutoML
from update_metadata_util import load_task

X_train, y_train, X_test, y_test, cat = load_task(252)

autoML = AutoML(time_budget=120,
                time_limit_for_evaluation=100,
                memory_limit=3024,
                seed=1,
                scoring_func="balanced_accuracy",
                exec_dir="execution_dir",
                verbose=True)

best_config, best_score = autoML.fit(X_train,
                                     y_train,
                                     X_test,
                                     y_test,
                                     categorical_features=cat)
print("Best config {0}\t Result:{1}".format(best_config, best_score))
Example no. 6
    'per_run_time_limit': per_run_time_limit,
    'initial_configurations_via_metalearning': 0,
    'ensemble_size': 0,
    'ensemble_nbest': 0,
    'seed': seed,
    'ml_memory_limit': 3072,
    'resampling_strategy': 'partial-cv',
    'resampling_strategy_arguments': {
        'folds': 10
    },
    'delete_tmp_folder_after_terminate': False,
    'tmp_folder': tmp_dir,
    'disable_evaluator_output': True,
}

X_train, y_train, X_test, y_test, cat = load_task(task_id)

if task_type == 'classification':
    automl = AutoSklearnClassifier(**automl_arguments)
    metric = balanced_accuracy
elif task_type == 'regression':
    automl = AutoSklearnRegressor(**automl_arguments)
    metric = r2
else:
    raise ValueError(task_type)

automl.fit(X_train,
           y_train,
           dataset_name=str(task_id),
           metric=metric,
           feat_type=cat)
parser.add_argument('--per-run-time-limit', type=int, required=True)
parser.add_argument('--task-id', type=int, required=True)
parser.add_argument('--metric', type=str, required=True)
parser.add_argument('-s', '--seed', type=int, required=True)
parser.add_argument('--unittest', action='store_true')
args = parser.parse_args()

working_directory = args.working_directory
time_limit = args.time_limit
per_run_time_limit = args.per_run_time_limit
task_id = args.task_id
seed = args.seed
metric = args.metric
is_test = args.unittest

X_train, y_train, X_test, y_test, cat, task_type = load_task(task_id)

configuration_output_dir = os.path.join(working_directory, 'configuration',
                                        task_type)
os.makedirs(configuration_output_dir, exist_ok=True)
tmp_dir = os.path.join(configuration_output_dir, str(task_id), metric)
os.makedirs(tmp_dir, exist_ok=True)

tempdir = tempfile.mkdtemp()
autosklearn_directory = os.path.join(tempdir, "dir")

automl_arguments = {
    'time_left_for_this_task': time_limit,
    'per_run_time_limit': per_run_time_limit,
    'initial_configurations_via_metalearning': 0,
    'ensemble_size': 0,
Example no. 8
def main(working_directory, time_limit, per_run_time_limit, task_id, seed):
    # Load data and other info.
    X_train, y_train, X_test, y_test, cat = load_task(task_id)

    # Path to the metadata directory. Is there a better way to get this?
    # Note: since the second argument below is an absolute path,
    # os.path.join() discards the first component and the hard-coded
    # path is used as-is.
    metadata_directory = os.path.abspath(os.path.dirname(__file__))
    metadata_directory = os.path.join(
        metadata_directory,
        "/home/tau/hrakotoa/Code/reproduce/auto-sklearn/auto-sklearn/autosklearn/metalearning/files/"
    )
    #metadata_directory = os.path.dirname(autosklearn.metalearning.files.__file__)

    # Create new metadata directory not containing task_id.
    new_metadata_directory = os.path.abspath(
        os.path.join(working_directory, "metadata_%i" % task_id))

    try:
        os.makedirs(new_metadata_directory)
        remove_dataset(metadata_directory, new_metadata_directory, task_id)
    except Exception:
        pass  # The metadata directory for this task has already been created.

    # We need to get task type, metric, is_sparse_or_dense information to
    # construct the path to the specific metadata directory. For details see
    # get_metalearning_suggestion() in smbo.py.
    TASK_TYPES_TO_STRING = {  # Mimic the same dict in autosklearn.constants
        'binary': 'binary.classification',
        'multiclass': 'multiclass.classification',
    }
    task_type = type_of_target(y_train)
    metadata_for_this_task = os.path.abspath(
        os.path.join(
            working_directory, "metadata_%i/balanced_accuracy_%s_sparse" %
            (task_id, TASK_TYPES_TO_STRING[task_type])))
    # how to check if data is sparse before running?

    configuration_output_dir = os.path.join(working_directory, str(seed))
    tmp_dir = os.path.join(configuration_output_dir, str(task_id))
    try:
        if not os.path.exists(configuration_output_dir):
            os.makedirs(configuration_output_dir)
    except Exception as _:
        print(
            "Direcotry {0} aleardy created.".format(configuration_output_dir))

    automl_arguments = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': per_run_time_limit,
        'initial_configurations_via_metalearning': 25,
        'ensemble_size': 0,
        'seed': seed,
        'ml_memory_limit': 3072,
        'resampling_strategy': 'holdout',
        'resampling_strategy_arguments': {
            'train_size': 0.67
        },
        'tmp_folder': tmp_dir,
        'delete_tmp_folder_after_terminate': False,
        'disable_evaluator_output': False,
    }

    automl = AutoSklearnClassifier(**automl_arguments)
    # Setting automl._automl._metadata_directory directly does not work here
    # because automl._automl is not created until fit() is called. Therefore,
    # we manually create automl._automl and set the metadata directory on it.
    automl._automl = automl.build_automl()
    automl._automl._metadata_directory = metadata_for_this_task

    # Fit.
    automl._automl.fit(
        X_train,
        y_train,
        dataset_name=str(task_id),
        X_test=X_test,
        y_test=y_test,
        metric=balanced_accuracy,
    )

    with open(os.path.join(tmp_dir, "score_metalearning.csv"), 'w') as fh:
        T = 0
        fh.write("Time,Train Performance,Test Performance\n")
        # Write the starting point: time 0, train performance 0, test performance 0.
        best_loss = 1
        fh.write("{0},{1},{2}\n".format(T, 0, 0))
        for key, value in automl._automl.runhistory_.data.items():
            t = value.time
            loss = value.cost
            T += t

            if loss < best_loss:
                fh.write("{0},{1},{2}\n".format(
                    T, 1 - loss,
                    1 - value.additional_info.get('test_loss', 1.0)))
                best_loss = loss
automl_arguments = {
    'time_left_for_this_task': time_limit,
    'per_run_time_limit': per_run_time_limit,
    'initial_configurations_via_metalearning': 0,
    'ensemble_size': 0,
    'ensemble_nbest': 0,
    'seed': seed,
    'ml_memory_limit': 3072,
    'resampling_strategy': 'partial-cv',
    'resampling_strategy_arguments': {'folds': 10},
    'delete_tmp_folder_after_terminate': False,
    'tmp_folder': tmp_dir,
    'disable_evaluator_output': True,
}

X_train, y_train, X_test, y_test, cat = load_task(task_id)

if task_type == 'classification':
    automl = AutoSklearnClassifier(**automl_arguments)
    metric = balanced_accuracy
elif task_type == 'regression':
    automl = AutoSklearnRegressor(**automl_arguments)
    metric = r2
else:
    raise ValueError(task_type)

automl.fit(X_train, y_train, dataset_name=str(task_id), metric=metric,
           feat_type=cat)
data = automl._automl._backend.load_datamanager()
# The data manager can't simply be replaced via save_datamanager(); it has
# to be deleted first.