Example No. 1
def load_data(dataset_info, backend, max_mem=None):
    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        if max_mem is None:
            D = CompetitionDataManager(dataset_info)
        else:
            D = CompetitionDataManager(dataset_info, max_memory_in_mb=max_mem)
    return D
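A minimal call sketch for the caching pattern above; the `Backend` constructor arguments follow Example No. 5 below, and the import path, directories, and dataset location are assumptions, not part of the snippet:

from autosklearn.util.backend import Backend  # assumed import path

backend = Backend('/tmp/askl_output', '/tmp/askl_tmp')  # hypothetical dirs
D = load_data('/data/abalone', backend, max_mem=1024)   # cap the loader at ~1 GB
print(D.info['task'], D.info['metric'])                 # fields used in Example No. 4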
Example No. 2
    def test_with_abalone(self):
        dataset = 'abalone'
        dataset_path = os.path.join(os.path.dirname(__file__), '.datasets',
                                    dataset)
        D = CompetitionDataManager(dataset_path)
        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        errors = []
        for i in range(N_TEST_RUNS):
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_,
                                          configuration,
                                          inner_cv_folds=2,
                                          outer_cv_folds=2)
            if not self._fit(evaluator):
                continue
            err = evaluator.predict()
            self.assertLess(err, 0.99)
            self.assertTrue(np.isfinite(err))
            errors.append(err)
        # Every fit should have succeeded, and the best error should be
        # reasonably low
        self.assertEqual(N_TEST_RUNS, len(errors))
        self.assertLess(min(errors), 0.77)
Example No. 3
def get_data_manager(namespace, encode_labels=False):
    """Get a dataset from a argparse.Namespace.

    Parameters
    ----------
    namespace : argparse.Namespace
        The return value of ArgumentParser.parse_args()

    encode_labels : bool (default=False)
        Whether to perform one-hot encoding of categorical data.

    Returns
    -------
    DataManager
        Loaded dataset.
    """
    data_format = namespace.data_format
    data_format_ = data_format.lower()
    data_format_ = data_format_.replace('-', '').replace('_', '')

    if data_format_ == 'arff':
        return ARFFDataManager(namespace.dataset, namespace.task,
                               namespace.metric, namespace.target,
                               encode_labels=encode_labels)
    elif data_format_ == 'automlcompetitionformat':
        return CompetitionDataManager(namespace.dataset,
                                      encode_labels=encode_labels)
    else:
        raise NotImplementedError('Data format %s not supported.' % data_format)
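A short usage sketch for get_data_manager: the format string is normalized (lower-cased, hyphens and underscores stripped), so any spelling that collapses to 'automlcompetitionformat' selects the competition loader. The namespace is built by hand here and the dataset path is a placeholder:

import argparse

ns = argparse.Namespace(data_format='AutoML-Competition-Format',
                        dataset='/data/abalone',  # placeholder path
                        task=None, metric=None, target=None)
D = get_data_manager(ns, encode_labels=True)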
Example No. 4
def evaluation(dataset, experiment_dir, run, output_path, ensemble_size=50):
    dataset_name = os.path.basename(
        dataset[:-1] if dataset.endswith("/") else dataset)
    try:
        os.makedirs(os.path.join(output_path, dataset_name))
    except OSError:
        # The directory already exists
        pass

    # Set path to the predictions and load the labels
    path_predictions_ensemble = os.path.join(experiment_dir, ".auto-sklearn",
                                             "predictions_ensemble")
    path_predictions_test = os.path.join(experiment_dir, ".auto-sklearn",
                                         "predictions_valid")

    valid_labels = np.load(
        os.path.join(experiment_dir, ".auto-sklearn",
                     "true_targets_ensemble.npy"))

    D = CompetitionDataManager(dataset, encode_labels=True)
    test_labels = D.data["Y_valid"]

    # Create output csv file
    csv_file = open(
        os.path.join(
            output_path, dataset_name, "ensemble_validationResults-traj-run-" +
            str(run) + "-walltime.csv"), "w")
    fieldnames = [
        'Time', 'Training (Empirical) Performance', 'Test Set Performance',
        'AC Overhead Time', 'Validation Configuration ID'
    ]
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()

    list_models = figure_indexes(path_predictions_ensemble)
    list_iter_models = [list_models[:j + 1] for j in range(len(list_models))]

    info = dict()
    info["task"] = D.info["task"]
    info["metric"] = D.info["metric"]
    info["label_num"] = D.info['label_num']

    with open(
            os.path.join(experiment_dir, ".auto-sklearn",
                         "start_time_%d" % int(run)), "r") as fh:
        start_time = float(fh.read())

    pfunc = partial(ensemble_loop,
                    starttime=start_time,
                    info=info,
                    ensemble_size=ensemble_size,
                    test_labels=test_labels,
                    valid_labels=valid_labels)

    list_to_write = Parallel(n_jobs=-1)(delayed(pfunc)(i)
                                        for i in list_iter_models)
    for rows in list_to_write:
        for row in rows:
            csv_writer.writerow(row)
    csv_file.close()
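For orientation, a hedged invocation of evaluation(); figure_indexes and ensemble_loop are project helpers not shown in these snippets, and every path and the run id below are illustrative:

evaluation(dataset='/data/abalone',
           experiment_dir='/experiments/abalone_run_1',
           run=1,
           output_path='/results')

Parallel and delayed here are the joblib helpers (or a vendored copy); since Parallel returns results in task order, the rows computed for the growing model prefixes come back sorted, and the CSV is written in iteration order.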
Example No. 5
def load_data(dataset_info, outputdir, tmp_dir=None, max_mem=None):
    if tmp_dir is None:
        tmp_dir = outputdir
    backend = Backend(outputdir, tmp_dir)
    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        if max_mem is None:
            D = CompetitionDataManager(dataset_info, encode_labels=True)
        else:
            D = CompetitionDataManager(dataset_info,
                                       encode_labels=True,
                                       max_memory_in_mb=max_mem)
    return D
Example No. 6
def store_and_or_load_data(dataset_info, outputdir):
    backend = Backend(None, outputdir)

    try:
        D = backend.load_datamanager()
    except IOError:
        D = None

    # Datamanager probably doesn't exist
    if D is None:
        D = CompetitionDataManager(dataset_info, encode_labels=True)
        backend.save_datamanager(D)

    return D
Example No. 7
    def fit_automl_dataset(self, dataset, metric):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        self._logger = self._get_logger(name)
        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        self._data_memory_limit = float(self._ml_memory_limit) / 3
        loaded_data_manager = CompetitionDataManager(
            dataset, max_memory_in_mb=self._data_memory_limit)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager, metric)
Example No. 8
    def fit_automl_dataset(self, dataset):
        self._stopwatch = StopWatch()
        self._backend.save_start_time(self._seed)

        name = os.path.basename(dataset)
        self._stopwatch.start_task(name)
        self._start_task(self._stopwatch, name)
        self._dataset_name = name

        logger_name = 'AutoML(%d):%s' % (self._seed, name)
        setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
        self._logger = get_logger(logger_name)

        self._logger.debug('======== Reading and converting data ==========')
        # Encoding the labels will be done after the metafeature calculation!
        loaded_data_manager = CompetitionDataManager(
            dataset, encode_labels=False,
            max_memory_in_mb=float(self._ml_memory_limit) / 3)
        loaded_data_manager_str = str(loaded_data_manager).split('\n')
        for part in loaded_data_manager_str:
            self._logger.debug(part)

        return self._fit(loaded_data_manager)
Example No. 9
def store_and_or_load_data(dataset_info, outputdir):
    dataset = os.path.basename(dataset_info)
    data_dir = os.path.dirname(dataset_info)
    # Reuse an existing pickle path directly; otherwise derive one
    if dataset_info.endswith(".pkl"):
        save_path = dataset_info
    else:
        save_path = os.path.join(outputdir, dataset + "_Manager.pkl")

    if not os.path.exists(save_path):
        lock = lockfile.LockFile(save_path)
        while not lock.i_am_locking():
            try:
                lock.acquire(timeout=60)  # wait up to 60 seconds
            except lockfile.LockTimeout:
                lock.break_lock()
                lock.acquire()
        print "I locked", lock.path
        # It is not yet sure, whether the file already exists
        try:
            if not os.path.exists(save_path):
                D = CompetitionDataManager(dataset,
                                           data_dir,
                                           verbose=True,
                                           encode_labels=True)
                fh = open(save_path, 'w')
                pickle.dump(D, fh, -1)
                fh.close()
            else:
                D = pickle.load(open(save_path, 'r'))
        except:
            raise
        finally:
            lock.release()
    else:
        D = pickle.load(open(save_path, 'r'))
    return D
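Note the second os.path.exists check after the lock is acquired: another process may have written the pickle while this one was blocked, so the existence test is repeated inside the critical section (double-checked locking on the filesystem).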
Example No. 10
def get_abalone_datamanager():
    dataset = 'abalone'
    dataset_path = os.path.join(os.path.dirname(__file__), '.datasets',
                                dataset)
    D = CompetitionDataManager(dataset_path)
    return D
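This helper pairs naturally with the search-space setup from Example No. 2; a brief sketch, where get_configuration_space is imported as in that test (its import is not shown in these snippets):

D = get_abalone_datamanager()
cs = get_configuration_space(D.info,
                             include_estimators=['extra_trees'],
                             include_preprocessors=['no_preprocessing'])
config = cs.sample_configuration()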