def load_data(dataset_info, backend, max_mem=None):
    try:
        D = backend.load_datamanager()
    except IOError:
        D = None  # Datamanager probably doesn't exist

    if D is None:
        if max_mem is None:
            D = CompetitionDataManager(dataset_info)
        else:
            D = CompetitionDataManager(dataset_info,
                                       max_memory_in_mb=max_mem)
    return D
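# Hypothetical usage sketch for load_data(). The Backend constructor
# arguments follow the load_data variant further down in this file
# (Backend(output_dir, tmp_dir)); the dataset path and memory limit are
# illustrative assumptions, not values from the original code.
backend = Backend('/tmp/automl_out', '/tmp/automl_tmp')
D = load_data('/path/to/datasets/abalone', backend, max_mem=1024)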
def test_with_abalone(self):
    dataset = 'abalone'
    dataset_path = os.path.join(os.path.dirname(__file__),
                                '.datasets', dataset)
    D = CompetitionDataManager(dataset_path)
    configuration_space = get_configuration_space(
        D.info,
        include_estimators=['extra_trees'],
        include_preprocessors=['no_preprocessing'])

    errors = []
    for i in range(N_TEST_RUNS):
        configuration = configuration_space.sample_configuration()
        D_ = copy.deepcopy(D)
        evaluator = NestedCVEvaluator(D_, configuration,
                                      inner_cv_folds=2,
                                      outer_cv_folds=2)
        if not self._fit(evaluator):
            continue
        err = evaluator.predict()
        self.assertLess(err, 0.99)
        self.assertTrue(np.isfinite(err))
        errors.append(err)

    # This is a reasonable bound
    self.assertEqual(10, len(errors))
    self.assertLess(min(errors), 0.77)
def get_data_manager(namespace, encode_labels=False):
    """Get a dataset from an argparse.Namespace.

    Parameters
    ----------
    namespace : argparse.Namespace
        The return value of ArgumentParser.parse_args()

    encode_labels : bool (default=False)
        Whether to perform one-hot encoding of categorical data.

    Returns
    -------
    DataManager
        Loaded dataset.
    """
    data_format = namespace.data_format
    data_format_ = data_format.lower()
    data_format_ = data_format_.replace('-', '').replace('_', '')

    if data_format_ == 'arff':
        return ARFFDataManager(namespace.dataset,
                               namespace.task,
                               namespace.metric,
                               namespace.target,
                               encode_labels=encode_labels)
    elif data_format_ == 'automlcompetitionformat':
        return CompetitionDataManager(namespace.dataset,
                                      encode_labels=encode_labels)
    else:
        raise NotImplementedError('Data format %s not supported.' %
                                  data_format)
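# A minimal usage sketch for get_data_manager(). The concrete values below
# (dataset path, format string) are illustrative assumptions, not values
# from the original code.
import argparse

args = argparse.Namespace(
    data_format='automl-competition-format',  # hyphens and underscores are
    dataset='/path/to/datasets/abalone',      # stripped before matching
    task=None, metric=None, target=None)      # only consulted for ARFF data
D = get_data_manager(args, encode_labels=True)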
def evaluation(dataset, experiment_dir, run, output_path, ensemble_size=50):
    dataset_name = os.path.basename(
        dataset[:-1] if dataset.endswith("/") else dataset)
    try:
        os.makedirs(os.path.join(output_path, dataset_name))
    except OSError:
        pass  # Output directory already exists

    # Set path to the predictions and load the labels
    path_predictions_ensemble = os.path.join(
        experiment_dir, ".auto-sklearn", "predictions_ensemble")
    path_predictions_test = os.path.join(
        experiment_dir, ".auto-sklearn", "predictions_valid")
    valid_labels = np.load(os.path.join(
        experiment_dir, ".auto-sklearn", "true_targets_ensemble.npy"))
    D = CompetitionDataManager(dataset, encode_labels=True)
    test_labels = D.data["Y_valid"]

    # Create output csv file
    csv_file = open(
        os.path.join(output_path, dataset_name,
                     "ensemble_validationResults-traj-run-" +
                     str(run) + "-walltime.csv"), "w")
    fieldnames = ['Time', 'Training (Empirical) Performance',
                  'Test Set Performance', 'AC Overhead Time',
                  'Validation Configuration ID']
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()

    # Build one growing prefix of models per ensemble iteration
    list_models = figure_indexes(path_predictions_ensemble)
    list_iter_models = [list_models[:j + 1]
                        for j in range(len(list_models))]

    info = dict()
    info["task"] = D.info["task"]
    info["metric"] = D.info["metric"]
    info["label_num"] = D.info["label_num"]

    with open(os.path.join(experiment_dir, ".auto-sklearn",
                           "start_time_%d" % int(run)), "r") as fh:
        start_time = float(fh.read())

    pfunc = partial(ensemble_loop, starttime=start_time, info=info,
                    ensemble_size=ensemble_size, test_labels=test_labels,
                    valid_labels=valid_labels)
    list_to_write = Parallel(n_jobs=-1)(
        delayed(pfunc)(i) for i in list_iter_models)
    for lw in list_to_write:
        for row in lw:
            csv_writer.writerow(row)
    csv_file.close()
def load_data(dataset_info, outputdir, tmp_dir=None, max_mem=None):
    if tmp_dir is None:
        tmp_dir = outputdir
    backend = Backend(outputdir, tmp_dir)

    try:
        D = backend.load_datamanager()
    except IOError:
        D = None  # Datamanager probably doesn't exist

    if D is None:
        if max_mem is None:
            D = CompetitionDataManager(dataset_info, encode_labels=True)
        else:
            D = CompetitionDataManager(dataset_info, encode_labels=True,
                                       max_memory_in_mb=max_mem)
    return D
def store_and_or_load_data(dataset_info, outputdir):
    backend = Backend(None, outputdir)
    try:
        D = backend.load_datamanager()
    except IOError:
        D = None  # Datamanager probably doesn't exist

    if D is None:
        D = CompetitionDataManager(dataset_info, encode_labels=True)
        backend.save_datamanager(D)
    return D
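# Cache-or-compute usage sketch, assuming a hypothetical dataset path and
# output directory: the first call parses the competition-format data and
# persists it via the Backend; subsequent calls load the stored
# DataManager instead of re-parsing.
D = store_and_or_load_data('/path/to/datasets/abalone', '/tmp/automl_out')
print(D.info['task'], D.info['metric'])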
def fit_automl_dataset(self, dataset, metric):
    self._stopwatch = StopWatch()
    self._backend.save_start_time(self._seed)

    name = os.path.basename(dataset)
    self._stopwatch.start_task(name)
    self._start_task(self._stopwatch, name)
    self._dataset_name = name
    self._logger = self._get_logger(name)

    self._logger.debug('======== Reading and converting data ==========')
    # Encoding the labels will be done after the metafeature calculation!
    self._data_memory_limit = float(self._ml_memory_limit) / 3
    loaded_data_manager = CompetitionDataManager(
        dataset, max_memory_in_mb=self._data_memory_limit)
    loaded_data_manager_str = str(loaded_data_manager).split('\n')
    for part in loaded_data_manager_str:
        self._logger.debug(part)

    return self._fit(loaded_data_manager, metric)
def fit_automl_dataset(self, dataset):
    self._stopwatch = StopWatch()
    self._backend.save_start_time(self._seed)

    name = os.path.basename(dataset)
    self._stopwatch.start_task(name)
    self._start_task(self._stopwatch, name)
    self._dataset_name = name

    logger_name = 'AutoML(%d):%s' % (self._seed, name)
    setup_logger(os.path.join(self._tmp_dir, '%s.log' % str(logger_name)))
    self._logger = get_logger(logger_name)

    self._logger.debug('======== Reading and converting data ==========')
    # Encoding the labels will be done after the metafeature calculation!
    loaded_data_manager = CompetitionDataManager(
        dataset, encode_labels=False,
        max_memory_in_mb=float(self._ml_memory_limit) / 3)
    loaded_data_manager_str = str(loaded_data_manager).split('\n')
    for part in loaded_data_manager_str:
        self._logger.debug(part)

    return self._fit(loaded_data_manager)
def store_and_or_load_data(dataset_info, outputdir):
    if dataset_info.endswith(".pkl"):
        save_path = dataset_info
    else:
        dataset = os.path.basename(dataset_info)
        data_dir = os.path.dirname(dataset_info)
        save_path = os.path.join(outputdir, dataset + "_Manager.pkl")

    if not os.path.exists(save_path):
        lock = lockfile.LockFile(save_path)
        while not lock.i_am_locking():
            try:
                lock.acquire(timeout=60)  # wait up to 60 seconds
            except lockfile.LockTimeout:
                lock.break_lock()
                lock.acquire()
        print("I locked", lock.path)
        # It is not yet sure whether the file already exists; another
        # process may have written it while we waited for the lock
        try:
            if not os.path.exists(save_path):
                D = CompetitionDataManager(dataset, data_dir,
                                           verbose=True,
                                           encode_labels=True)
                with open(save_path, 'wb') as fh:
                    pickle.dump(D, fh, -1)
            else:
                with open(save_path, 'rb') as fh:
                    D = pickle.load(fh)
        finally:
            lock.release()
    else:
        with open(save_path, 'rb') as fh:
            D = pickle.load(fh)
    return D
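# A hedged concurrency sketch for the function above: several workers call
# store_and_or_load_data() at once, and the lockfile double-check ensures
# exactly one of them builds and pickles the DataManager while the others
# wait and then unpickle "<dataset>_Manager.pkl". The paths and the worker
# count are hypothetical.
from multiprocessing import Pool

def _load_abalone(_):
    return store_and_or_load_data('/path/to/datasets/abalone',
                                  '/tmp/automl_out')

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        managers = pool.map(_load_abalone, range(4))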
def get_abalone_datamanager():
    dataset = 'abalone'
    dataset_path = os.path.join(os.path.dirname(__file__),
                                '.datasets', dataset)
    D = CompetitionDataManager(dataset_path)
    return D