def test_split_classification_many_imbalanced_classes(self):
    """Check that single-member classes stay out of the validation split.

    The label vector holds eight 0s, eight 1s and exactly one sample each
    of the classes 2, 3, 4 and 5.  After shuffling the labels, the data is
    split with ``classification=True`` and the largest label found in the
    validation part must not exceed 1.  The check is repeated ten times,
    each with a fresh shuffle.
    """
    for _ in range(10):
        X = np.array([range(20), range(20)]).transpose()
        y = np.array(
            (0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5))
        np.random.shuffle(y)
        X_train, X_valid, Y_train, Y_valid = split_data(
            X, y, classification=True)
        # Only the two populous classes (0 and 1) may show up here.
        self.assertLessEqual(max(Y_valid), 1)
 def test_split_classification_many_imbalanced_classes(self):
     """Check that single-member classes stay out of the validation split.

     Labels consist of eight 0s, eight 1s and one sample each of classes
     2-5.  For ten independent shuffles of the labels, the data is split
     with ``classification=True`` and the maximum validation label is
     asserted to be at most 1.
     """
     for _ in range(10):
         X = np.array([range(20), range(20)]).transpose()
         y = np.array((0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
                       4, 5))
         np.random.shuffle(y)
         X_train, X_valid, Y_train, Y_valid = split_data(X, y,
                                                         classification=True)
         # Classes 2-5 have a single member each and must not appear here.
         self.assertLessEqual(max(Y_valid), 1)
    def _split_regular(self):
        """Split a six-sample toy dataset and verify shapes and label order.

        Calls ``split_data`` with its default arguments and expects four
        training samples and two validation samples, with the exact label
        sequences asserted at the end.  The method is not named ``test_*``,
        so it is presumably not collected by the test runner.
        """
        features = np.array([[1, 2], [3, 4]] * 3)
        labels = np.array([0, 0, 0, 1, 1, 2])
        train_X, valid_X, train_y, valid_y = split_data(features, labels)

        # Check shapes, in the order train X, train y, valid X, valid y.
        expected_shapes = (
            (train_X, (4, 2)),
            (train_y, (4, )),
            (valid_X, (2, 2)),
            (valid_y, (2, )),
        )
        for array, shape in expected_shapes:
            self.assertEqual(array.shape, shape)

        # Check the concrete label assignment of both parts.
        self.assertListEqual(list(valid_y), [0, 0])
        self.assertListEqual(list(train_y), [2, 0, 1, 1])
    def _split_regular(self):
        """Verify the default split of a tiny three-class dataset.

        Six two-feature samples with labels ``[0, 0, 0, 1, 1, 2]`` are passed
        to ``split_data``; the result must contain four training and two
        validation samples with the label sequences asserted below.
        """
        samples = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
        targets = np.array([0, 0, 0, 1, 1, 2])
        parts = split_data(samples, targets)
        X_tr, X_va, y_tr, y_va = parts

        # Shapes of the training part ...
        self.assertEqual(X_tr.shape, (4, 2))
        self.assertEqual(y_tr.shape, (4, ))
        # ... and of the validation part.
        self.assertEqual(X_va.shape, (2, 2))
        self.assertEqual(y_va.shape, (2, ))

        # Exact label content of each part.
        self.assertListEqual(list(y_va), [0, 0])
        self.assertListEqual(list(y_tr), [2, 0, 1, 1])
    def _stratify(self):
        """Split a two-class toy dataset and verify sizes and label order.

        Six samples with labels ``[0, 0, 0, 0, 1, 1]`` go through
        ``split_data`` with default arguments.  Each dimension is checked
        individually before the exact label sequences are compared.
        """
        features = np.array([[1, 2], [3, 4]] * 3)
        labels = np.array([0, 0, 0, 0, 1, 1])
        train_X, valid_X, train_y, valid_y = split_data(features, labels)

        # (array, axis, expected length); evaluated lazily, one at a time.
        dimension_checks = (
            (train_X, 0, 4),
            (train_X, 1, 2),
            (train_y, 0, 4),
            (valid_X, 0, 2),
            (valid_X, 1, 2),
            (valid_y, 0, 2),
        )
        for array, axis, expected in dimension_checks:
            self.assertEqual(array.shape[axis], expected)

        self.assertListEqual(list(valid_y), [1, 0])
        self.assertListEqual(list(train_y), [0, 0, 0, 1])
    def _stratify(self):
        """Verify the default split of a small imbalanced two-class dataset.

        The input holds four samples of class 0 and two of class 1.  The
        split must yield four training and two validation samples, and the
        label order of both parts is compared against fixed lists.
        """
        samples = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
        targets = np.array([0, 0, 0, 0, 1, 1])
        X_tr, X_va, y_tr, y_va = split_data(samples, targets)

        # Training part: rows, columns, number of labels.
        n_rows, n_cols, n_labels = 4, 2, 4
        self.assertEqual(X_tr.shape[0], n_rows)
        self.assertEqual(X_tr.shape[1], n_cols)
        self.assertEqual(y_tr.shape[0], n_labels)

        # Validation part: rows, columns, number of labels.
        n_rows, n_cols, n_labels = 2, 2, 2
        self.assertEqual(X_va.shape[0], n_rows)
        self.assertEqual(X_va.shape[1], n_cols)
        self.assertEqual(y_va.shape[0], n_labels)

        self.assertListEqual(list(y_va), [1, 0])
        self.assertListEqual(list(y_tr), [0, 0, 0, 1])
    def __init__(self, Datamanager, configuration, with_predictions=False,
                 all_scoring_functions=False, seed=1, output_dir=None,
                 output_y_test=False, num_run=None):
        """Set up a holdout evaluation for one configuration.

        All arguments are forwarded unchanged to the parent initialiser.
        Afterwards the training data held by ``Datamanager`` is split into a
        training part and an optimization part, and the model is
        instantiated from ``self.model_class``.

        Parameters
        ----------
        Datamanager : object
            Must provide ``info['task']`` as well as ``data['X_train']`` and
            ``data['Y_train']``.
        configuration, with_predictions, all_scoring_functions, seed,
        output_dir, output_y_test, num_run
            Passed through to the parent class.
        """
        super(HoldoutEvaluator, self).__init__(
            Datamanager,
            configuration,
            with_predictions=with_predictions,
            all_scoring_functions=all_scoring_functions,
            seed=seed,
            output_dir=output_dir,
            output_y_test=output_y_test,
            num_run=num_run)

        # The task type is handed to split_data as its classification flag.
        is_classification = Datamanager.info['task'] in CLASSIFICATION_TASKS
        splits = split_data(Datamanager.data['X_train'],
                            Datamanager.data['Y_train'],
                            classification=is_classification)
        (self.X_train, self.X_optimization,
         self.Y_train, self.Y_optimization) = splits

        self.model = self.model_class(self.configuration, self.seed)
    def test_split_data_regression(self):
        """Split 1000 random regression samples and check the 670/330 cut.

        The random generator is seeded with 42, so the number of feature
        columns and the two spot-checked matrix entries are reproducible.
        """
        n_points = 1000
        np.random.seed(42)
        # Draw order matters: dimensionality first, then X, then y.
        n_dims = np.random.randint(1, 100)
        features = np.random.rand(n_points, n_dims)
        targets = np.random.rand(n_points)

        train_X, valid_X, train_y, valid_y = split_data(features, targets)

        # (array, axis, expected length); evaluated lazily, one at a time.
        dimension_checks = (
            (train_X, 0, 670),
            (valid_X, 0, 330),
            (train_y, 0, 670),
            (valid_y, 0, 330),
            (train_X, 1, n_dims),
            (valid_X, 1, n_dims),
        )
        for array, axis, expected in dimension_checks:
            self.assertEqual(array.shape[axis], expected)

        # Random checks
        self.assertAlmostEqual(train_X[4, 2], 0.5986584841970366)
        self.assertAlmostEqual(valid_X[4, 2], 0.63911512838980322)
    def test_split_data_regression(self):
        """Check sizes and two sample values of a seeded regression split.

        With the seed fixed at 42, 1000 samples with a randomly drawn number
        of features are split by ``split_data``; the test expects 670
        training rows, 330 validation rows and an unchanged column count.
        """
        n_points = 1000
        expected_train, expected_valid = 670, 330

        np.random.seed(42)
        n_dims = np.random.randint(1, 100)
        X = np.random.rand(n_points, n_dims)
        y = np.random.rand(n_points)

        X_tr, X_va, y_tr, y_va = split_data(X, y)

        # Row counts of all four outputs.
        self.assertEqual(X_tr.shape[0], expected_train)
        self.assertEqual(X_va.shape[0], expected_valid)
        self.assertEqual(y_tr.shape[0], expected_train)
        self.assertEqual(y_va.shape[0], expected_valid)
        # The number of feature columns must survive the split.
        self.assertEqual(X_tr.shape[1], n_dims)
        self.assertEqual(X_va.shape[1], n_dims)

        # Random checks
        self.assertAlmostEqual(X_tr[4, 2], 0.5986584841970366)
        self.assertAlmostEqual(X_va[4, 2], 0.63911512838980322)
Beispiel #10
0
    def _fit(self, D):
        """Prepare the dataset on disk and launch SMAC and the ensemble builder.

        The method performs these steps in order:

        1. Copy metric, task and target count from ``D.info`` onto ``self``
           and claim the ``AUTOSKLEARN_SEED`` environment variable.
        2. Split the training data and store the ensemble labels in
           ``true_labels_ensemble.npy`` inside ``self.tmp_dir``.
        3. Calculate metafeatures (binary/multiclass classification only,
           and only if ``self.initial_configurations_via_metalearning`` is
           positive).
        4. One-hot encode the data and pickle the data manager.
        5. Create the configuration space and write it to ``space.pcs``.
        6. Calculate the encoded metafeatures and ask metalearning for
           initial configurations.
        7. Create the output directories and the instance file.
        8. Start SMAC and the ensemble builder through ``submit_process``.

        Files shared inside ``self.tmp_dir`` are only written if they do not
        exist yet; each of these writes is guarded by a lock file.

        Parameters
        ----------
        D : data manager
            Object providing ``info``, ``data``, ``feat_type``,
            ``perform1HotEncoding()`` and, after encoding, ``encoder_``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``AUTOSKLEARN_SEED`` is already set in the environment to a
            value different from ``self.seed``.
        """
        # TODO: check that data and task definition fit together!

        self.metric_ = D.info['metric']
        self.task_ = D.info['task']
        self.target_num_ = D.info['target_num']

        # Set environment variable:
        seed = os.environ.get("AUTOSKLEARN_SEED")
        if seed is not None and int(seed) != self.seed:
            raise ValueError("It seems you have already started an instance "
                             "of AutoSklearn in this thread.")
        else:
            os.environ["AUTOSKLEARN_SEED"] = str(self.seed)

        # == Split dataset and store Data for the ensemble script
        # NOTE(review): no classification flag is passed to the split here;
        # confirm this yields the same split the evaluation processes use.
        X_train, X_ensemble, Y_train, Y_ensemble = split_data.split_data(
            D.data['X_train'], D.data['Y_train'])

        true_labels_ensemble_filename = os.path.join(self.tmp_dir,
                                                     "true_labels_ensemble.npy")
        true_labels_ensemble_lock = true_labels_ensemble_filename + ".lock"
        with lockfile.LockFile(true_labels_ensemble_lock):
            if not os.path.exists(true_labels_ensemble_filename):
                np.save(true_labels_ensemble_filename, Y_ensemble)

        # Only Y_ensemble was needed; drop the references to the split.
        del X_train, X_ensemble, Y_train, Y_ensemble

        time_needed_to_load_data = self.stopwatch_.wall_elapsed(self.basename_)
        time_left_after_reading = max(0, self.time_left_for_this_task -
                                      time_needed_to_load_data)
        self.logger.info("Remaining time after reading %s %5.2f sec" %
                    (self.basename_, time_left_after_reading))

        self.stopwatch_.stop_task("LoadData")

        # == Calculate metafeatures
        self.stopwatch_.start_task("CalculateMetafeatures")
        # One boolean per entry of D.feat_type.
        categorical = [feat_type.lower() == "categorical"
                       for feat_type in D.feat_type]

        if self.initial_configurations_via_metalearning <= 0:
            ml = None
        elif D.info["task"] in \
                [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
            ml = metalearning.MetaLearning()
            self.logger.debug("Start calculating metafeatures for %s" %
                              self.basename_)
            ml.calculate_metafeatures_with_labels(D.data["X_train"],
                                                  D.data["Y_train"],
                                                  categorical=categorical,
                                                  dataset_name=self.basename_)
        else:
            ml = None
            self.logger.critical("Metafeatures not calculated")
        self.stopwatch_.stop_task("CalculateMetafeatures")
        self.logger.debug(
            "Calculating Metafeatures (categorical attributes) took %5.2f" %
            self.stopwatch_.wall_elapsed("CalculateMetafeatures"))

        self.stopwatch_.start_task("OneHot")
        D.perform1HotEncoding()
        self.ohe_ = D.encoder_
        self.stopwatch_.stop_task("OneHot")

        # == Pickle the data manager
        self.stopwatch_.start_task("StoreDatamanager")
        data_manager_path = os.path.join(self.tmp_dir,
                                         self.basename_ + "_Manager.pkl")
        data_manager_lockfile = data_manager_path + ".lock"
        with lockfile.LockFile(data_manager_lockfile):
            if not os.path.exists(data_manager_path):
                # protocol=-1 selects a binary pickle format, so the file
                # must be opened in binary mode; the context manager makes
                # sure the handle is flushed and closed before the lock is
                # released.
                with open(data_manager_path, 'wb') as fh:
                    pickle.dump(D, fh, protocol=-1)
                self.logger.debug("Pickled Datamanager at %s" %
                                  data_manager_path)
            else:
                self.logger.debug("Data manager already presend at %s" %
                                  data_manager_path)
        self.stopwatch_.stop_task("StoreDatamanager")

        # = Create a searchspace
        self.stopwatch_.start_task("CreateConfigSpace")
        configspace_path = os.path.join(self.tmp_dir, "space.pcs")
        self.configuration_space = paramsklearn.get_configuration_space(
            D.info)

        self.configuration_space_created_hook()

        sp_string = pcs_parser.write(self.configuration_space)
        configuration_space_lockfile = configspace_path + ".lock"
        with lockfile.LockFile(configuration_space_lockfile):
            if not os.path.exists(configspace_path):
                with open(configspace_path, "w") as fh:
                    fh.write(sp_string)
                self.logger.debug("Configuration space written to %s" %
                                  configspace_path)
            else:
                self.logger.debug("Configuration space already present at %s" %
                                  configspace_path)
        self.stopwatch_.stop_task("CreateConfigSpace")

        if ml is None:
            initial_configurations = []
        elif D.info["task"] in \
                [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
            self.stopwatch_.start_task("CalculateMetafeaturesEncoded")
            # After one-hot encoding no column is treated as categorical.
            # The flag list needs one entry per feature, i.e. per column of
            # X_train (assumes rows are samples -- TODO confirm), not one
            # entry per row.
            n_encoded_features = D.data["X_train"].shape[1]
            ml.calculate_metafeatures_encoded_labels(
                X_train=D.data["X_train"],
                Y_train=D.data["Y_train"],
                categorical=[False] * n_encoded_features,
                dataset_name=self.basename_)
            self.stopwatch_.stop_task("CalculateMetafeaturesEncoded")
            self.logger.debug(
                "Calculating Metafeatures (encoded attributes) took %5.2fsec" %
                self.stopwatch_.wall_elapsed("CalculateMetafeaturesEncoded"))

            self.logger.debug(ml._metafeatures_labels.__repr__(verbosity=2))
            self.logger.debug(
                ml._metafeatures_encoded_labels.__repr__(verbosity=2))

            self.stopwatch_.start_task("InitialConfigurations")
            try:
                initial_configurations = \
                    ml.create_metalearning_string_for_smac_call(
                        self.configuration_space, self.basename_,
                        self.metric_, self.task_,
                        bool(D.info['is_sparse'] == 1),
                        self.initial_configurations_via_metalearning,
                        self.metadata_directory)
            except Exception as e:
                # Metalearning is best effort: log the failure and carry on
                # without initial configurations.
                import traceback

                self.logger.error(str(e))
                self.logger.error(traceback.format_exc())
                initial_configurations = []

            self.stopwatch_.stop_task("InitialConfigurations")

            self.logger.debug("Initial Configurations: (%d)",
                              len(initial_configurations))
            for initial_configuration in initial_configurations:
                self.logger.debug(initial_configuration)
            self.logger.debug(
                "Looking for initial configurations took %5.2fsec" %
                self.stopwatch_.wall_elapsed("InitialConfigurations"))
            self.logger.info(
                "Time left for %s after finding initial configurations: %5.2fsec" %
                (self.basename_, self.time_left_for_this_task -
                 self.stopwatch_.wall_elapsed(self.basename_)))
        else:
            initial_configurations = []
            self.logger.critical("Metafeatures encoded not calculated")

        # == Set up a directory where all the trained models will be pickled to
        if self.keep_models:
            self.model_directory_ = os.path.join(self.tmp_dir,
                                                 "models_%d" % self.seed)
            os.mkdir(self.model_directory_)
        self.ensemble_indices_directory_ = os.path.join(
            self.tmp_dir, "ensemble_indices_%d" % self.seed)
        os.mkdir(self.ensemble_indices_directory_)

        # == RUN SMAC
        self.stopwatch_.start_task("runSmac")
        # = Create an empty instance file
        instance_file = os.path.join(self.tmp_dir, "instances.txt")
        instance_file_lock = instance_file + ".lock"
        with lockfile.LockFile(instance_file_lock):
            # Test for the instance file itself (not for its lock path), in
            # line with the other guarded writes in this method.
            if not os.path.exists(instance_file):
                with open(instance_file, "w") as fh:
                    fh.write("holdout")
                self.logger.debug("Created instance file %s" % instance_file)
            else:
                self.logger.debug("Instance file already present at %s" %
                                  instance_file)

        # = Start SMAC
        time_left_for_smac = max(0, self.time_left_for_this_task - (
            self.stopwatch_.wall_elapsed(self.basename_)))
        self.logger.debug("Start SMAC with %5.2fsec time left" %
                          time_left_for_smac)
        proc_smac, smac_call = \
            submit_process.run_smac(dataset_name=self.basename_,
                                    dataset=data_manager_path,
                                    tmp_dir=self.tmp_dir,
                                    searchspace=configspace_path,
                                    instance_file=instance_file,
                                    limit=time_left_for_smac,
                                    cutoff_time=self.per_run_time_limit,
                                    initial_challengers=initial_configurations,
                                    memory_limit=self.ml_memory_limit,
                                    seed=self.seed)
        self.logger.debug(smac_call)
        self.stopwatch_.stop_task("runSmac")

        # == RUN ensemble builder
        self.stopwatch_.start_task("runEnsemble")
        time_left_for_ensembles = max(0, self.time_left_for_this_task - (
            self.stopwatch_.wall_elapsed(self.basename_)))
        self.logger.debug("Start Ensemble with %5.2fsec time left" %
                          time_left_for_ensembles)
        proc_ensembles = \
            submit_process.run_ensemble_builder(
                tmp_dir=self.tmp_dir,
                dataset_name=self.basename_,
                task_type=self.task_,
                metric=self.metric_,
                limit=time_left_for_ensembles,
                output_dir=self.output_dir,
                ensemble_size=self.ensemble_size,
                ensemble_nbest=self.ensemble_nbest,
                seed=self.seed,
                ensemble_indices_output_dir=self.ensemble_indices_directory_)
        self.stopwatch_.stop_task("runEnsemble")

        del D

        # With a queue the caller receives the process handles; without one
        # this method blocks until both processes have finished.
        if self.queue is not None:
            self.queue.put([time_needed_to_load_data, data_manager_path,
                            proc_smac, proc_ensembles])
        else:
            proc_smac.wait()
            proc_ensembles.wait()

        # Delete AutoSklearn environment variable
        # NOTE(review): this line is not reached when anything above raises,
        # so the variable then stays set for the rest of the process.
        del os.environ["AUTOSKLEARN_SEED"]
        return self