Example 1
    def landmark_decision_tree(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if scipy.sparse.issparse(X):
                return np.NaN

            import sklearn.tree

            # pylint: disable=C0103
            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)

            accuracy = 0.
            for train, test in kf.split(X, y):
                random_state = sklearn.utils.check_random_state(42)
                tree = sklearn.tree.DecisionTreeClassifier(
                    random_state=random_state)

                if len(y.shape) == 1 or y.shape[1] == 1:
                    tree.fit(X[train], y[train])
                else:
                    tree = OneVsRestClassifier(tree)
                    tree.fit(X[train], y[train])

                predictions = tree.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(
                    predictions, y[test])
            return accuracy / 10
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark Decision Tree could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
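
A minimal usage sketch for this landmarker, assuming it is exposed as a static method of a StatisticalInformation class (as the later examples suggest) and using scikit-learn's bundled iris data in place of a real dataset:

    # Hypothetical usage -- StatisticalInformation is assumed to expose the
    # landmarker above as a static method.
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    score = StatisticalInformation.landmark_decision_tree(X, y)
    # Mean 10-fold decision-tree accuracy; on iris this is typically > 0.9.
    print(score)
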
Example 2
    def class_probability_std(y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            occurence_dict = StatisticalInformation.class_ocurrences(y)

            if len(y.shape) == 2:
                stds = []
                for i in range(y.shape[1]):
                    std = np.array(
                        list(occurence_dict[i].values()), dtype=np.float64)
                    std = (std / y.shape[0]).std()
                    stds.append(std)
                return np.mean(stds)
            else:
                occurences = np.array(
                    [occurrence for occurrence in occurence_dict.values()],
                    dtype=np.float64)
                return (occurences / y.shape[0]).std()
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Class probability std could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
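
The statistic is simply the standard deviation of the empirical class frequencies. A self-contained numeric check with plain NumPy (independent of the class_ocurrences helper) looks like this:

    import numpy as np

    y = np.array([0, 0, 0, 1, 1, 2])                    # class counts: 3, 2, 1
    freqs = np.array([3, 2, 1], dtype=np.float64) / y.shape[0]
    # freqs is [0.5, 0.333..., 0.167...]; its standard deviation is ~0.136
    print(round(freqs.std(), 3))
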
Example 3
    def landmark_random_node_learner(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if scipy.sparse.issparse(X):
                return np.NaN

            import sklearn.tree

            # pylint: disable=C0103
            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)
            accuracy = 0.

            for train, test in kf.split(X, y):
                random_state = sklearn.utils.check_random_state(42)
                node = sklearn.tree.DecisionTreeClassifier(
                    criterion="entropy",
                    max_depth=1,
                    random_state=random_state,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_features=1)
                node.fit(X[train], y[train])
                predictions = node.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(
                    predictions, y[test])
            return accuracy / 10
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark Random Tree Node Learner could not be computed. \
Returning 0 instead. Originally failed with exception '{ex}'".format(ex=ex),
                'WARNING')
            return 0.
Example 4
    def add_attribute_key(self, column=None):
        """Add an attribute to the list of keys.

        A key is the identifier in the dataset, similar to what we call a
        primary key in databases.

        Args:
            column (str): The column to add as a key in the dataset. Defaults
                to `None`.

        Raises:
            ValueError: If column is None or not in the attributes.

        """
        if column is None:
            raise ValueError("column cannot be None")

        if column not in self.attribute_names():
            raise ValueError("Invalid column. Column is not in the dataset")

        if column in self.key_attributes:
            log_msg = "Column '{col}' already existed in key_attributes. \
                      Skipping ...".format(col=column)

            automl.automl_log(log_msg, 'WARNING')
        else:
            self.key_attributes.append(column)
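
A hypothetical call sequence; the dataset object and the 'customer_id' column below are illustrative and not part of the original source:

    # Illustrative only -- 'dataset' is assumed to be an existing Dataset
    # whose attribute_names() include 'customer_id'.
    dataset.add_attribute_key('customer_id')    # appended to key_attributes
    dataset.add_attribute_key('customer_id')    # duplicate: only logs a WARNING
    # dataset.add_attribute_key('no_such_col')  # would raise ValueError
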
Example 5
    def skewnesses(X, categorical_indicators):  # pylint: disable=C0103
        """Compute statistic."""
        if scipy.sparse.issparse(X):
            skews = []
            X_new = X.tocsc()  # pylint: disable=C0103
            for i in range(X_new.shape[1]):
                if not categorical_indicators[i]:
                    start = X_new.indptr[i]
                    stop = X_new.indptr[i + 1]
                    try:
                        skews.append(scipy.stats.skew(X_new.data[start:stop]))
                    except Exception as ex:  # pylint: disable=W0703
                        automl_log(
                            "Skewness of row {i} could not be computed. \
Returning 0 instead. Originally failed with exception \
'{ex}'".format(i=i, ex=ex), 'WARNING')
                        skews.append(0)
            return skews
        else:
            skews = []
            for i in range(X.shape[1]):
                if not categorical_indicators[i]:
                    try:
                        skews.append(scipy.stats.skew(X[:, i]))
                    except Exception as ex:  # pylint: disable=W0703
                        automl_log(
                            "Skewness of row {i} could not be computed. \
Returning 0 instead. Originally failed with exception \
'{ex}'".format(i=i, ex=ex), 'WARNING')
                        skews.append(0)
            return skews
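
A quick sanity check of the dense branch, assuming the function is reachable as StatisticalInformation.skewnesses; the second column is flagged as categorical and is therefore skipped:

    import numpy as np

    X = np.array([[1.0, 0.0], [2.0, 1.0], [3.0, 0.0], [10.0, 1.0]])
    categorical_indicators = [False, True]
    # Only the first (numerical) column contributes a skewness value,
    # so the result is a one-element list with a positive skew.
    print(StatisticalInformation.skewnesses(X, categorical_indicators))
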
Example 6
    def landmark_naive_bayes(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if scipy.sparse.issparse(X):
                return np.NaN

            import sklearn.naive_bayes

            # pylint: disable=C0103
            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)

            accuracy = 0.
            for train, test in kf.split(X, y):
                nb = sklearn.naive_bayes.GaussianNB()  # pylint: disable=C0103

                if len(y.shape) == 1 or y.shape[1] == 1:
                    nb.fit(X[train], y[train])
                else:
                    nb = OneVsRestClassifier(nb)  # pylint: disable=C0103
                    nb.fit(X[train], y[train])

                predictions = nb.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(
                    predictions, y[test])
            return accuracy / 10
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark Naive Bayes could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example 7
    def landmark_1NN(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            import sklearn.neighbors

            # pylint: disable=C0103
            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)

            accuracy = 0.
            for train, test in kf.split(X, y):
                kNN = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)
                if len(y.shape) == 1 or y.shape[1] == 1:
                    kNN.fit(X[train], y[train].ravel())
                else:
                    kNN = OneVsRestClassifier(kNN)
                    kNN.fit(X[train], y[train])
                predictions = kNN.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(
                    predictions, y[test])
            return accuracy / 10
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark 1NN could not be computed. Returning 0 instead. \
Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example 8
    def number_of_categorical_features(categorical_indicators):
        """Compute statistic."""
        try:
            return np.sum(categorical_indicators)
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark Decision Node Learner could not be computed. \
Returning 0 instead. Originally failed with exception '{ex}'".format(ex=ex),
                'WARNING')
            return 0.
Example 9
    def kurtosis_min(X, categorical_indicators):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            kurts = StatisticalInformation.kurtosisses(X,
                                                       categorical_indicators)
            # pylint: disable=C1801
            minimum = np.nanmin(kurts) if len(kurts) > 0 else 0
            return minimum if np.isfinite(minimum) else 0
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Kurtosis min could not be computed. Returning 0 instead. \
Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example 10
    def skewness_std(X, categorical_indicators):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            skews = StatisticalInformation.skewnesses(X,
                                                      categorical_indicators)
            # pylint: disable=C1801
            std = np.nanstd(skews) if len(skews) > 0 else 0
            return std if np.isfinite(std) else 0
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Skewness std could not be computed. Returning 0 instead. \
Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example 11
    def metafeatures_vector(self):
        """Return the metafeatures of this dataset as a vector (numpy array).

        Returns:
            np.array: The metafeatures as a numpy array.

        """
        res = MetaFeaturesManager(self).metafeatures_as_numpy_array()
        if np.count_nonzero(np.isnan(res)) > 0:
            automl_log("It was not possible to compute all metafeatures (see \
log messages). We will replace the NaN values in the meta-features vector \
with 0", 'WARNING')
            np.nan_to_num(res, copy=False)
        return res
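
The NaN handling relies on np.nan_to_num mutating the array in place when copy=False; in isolation the effect is just:

    import numpy as np

    res = np.array([0.7, np.nan, 1.2])
    if np.count_nonzero(np.isnan(res)) > 0:
        np.nan_to_num(res, copy=False)   # in place: NaN becomes 0.0
    print(res)                           # [0.7 0.  1.2]
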
Example 12
    def _internal_validation(self):
        # Check for missing values
        nan_sum_x = self.X.isna().sum().sum()
        nan_sum_y = self.y.isna().sum().sum()

        # Log missing values messages
        if nan_sum_x > 0:
            automl_log("Features set (X) in dataset '{d_id}' contains {n_na} \
missing values. Please note that with missing data, results can be inaccurate. \
Fix the missing values yourself.".format(d_id=self.dataset_id,
                                         n_na=nan_sum_x),
                       'WARNING')

        if nan_sum_y > 0:
            automl_log("Target set (y) in dataset '{d_id}' contains {n_na} \
missing values. Please note that with missing data, results can be inaccurate. \
Fix the missing values yourself.".format(d_id=self.dataset_id,
                                         n_na=nan_sum_y),
                       'WARNING')
        # Check for infinite values
        inf_sum_x = np.isinf(self.X.values).ravel().sum()
        inf_sum_y = np.isinf(self.y.values).ravel().sum()

        # Log infinite values messages
        if inf_sum_x > 0:
            automl_log("Features set (X) in dataset '{d_id}' contains {n_inf} \
infinite values. Please note that with infinite data, results can be \
inaccurate. Fix the infinite values yourself.".format(d_id=self.dataset_id,
                                                      n_inf=inf_sum_x),
                       'WARNING')

        if inf_sum_y > 0:
            automl_log("Target set (y) in dataset '{d_id}' contains {n_inf} \
infinite values. Please note that with infinite data, results can be \
inaccurate. Fix the infinite values yourself.".format(d_id=self.dataset_id,
                                                      n_inf=inf_sum_y),
                       'WARNING')
Example 13
    def build_configuration(self):
        """Build a ML Suggestion with the row passed at instaciation time.

        Returns:
            MLSuggestion: A suggestion with classifiers, pre-processors,
                scalers, encoders and imputation methods.

        """
        imputation_col = _PF_IMPUTATION + _CSV_COL_SEP + _PF_STRATEGY
        classifier_choice_col = _PF_CLASSIFIER + _CSV_COL_SEP + _CSV_CHOICE
        preprocessor_choice_col = _PF_PREPROCESSOR + _CSV_COL_SEP + _CSV_CHOICE
        rescaler_choice_col = _PF_RESCALING + _CSV_COL_SEP + _CSV_CHOICE
        encoding_choice_col = _PF_CATEGORICAL_ENCODING + _CSV_COL_SEP + \
            _CSV_CHOICE

        suggestions_dict = {
            _PF_CLASSIFIER: [],
            _PF_PREPROCESSOR: [],
            _PF_RESCALING: [],
            _PF_CATEGORICAL_ENCODING: [],
            _PF_IMPUTATION: [],
        }

        for attribute in [
                imputation_col, classifier_choice_col, preprocessor_choice_col,
                rescaler_choice_col, encoding_choice_col
        ]:
            if attribute in self.model_row.index:
                list_name = attribute.split(_CSV_COL_SEP)[0]
                suggestion = self._from_internal_list(attribute)
                suggestions_dict[list_name].append(suggestion)
            else:
                msg = "No attribute '{attr}' in current element\
                      ".format(attr=attribute)
                automl.automl_log(msg, 'WARNING')

        mlsuggestion = MLSuggestion(
            classifiers=suggestions_dict[_PF_CLASSIFIER],
            preprocessors=suggestions_dict[_PF_PREPROCESSOR],
            encoders=suggestions_dict[_PF_CATEGORICAL_ENCODING],
            rescalers=suggestions_dict[_PF_RESCALING],
            imputations=suggestions_dict[_PF_IMPUTATION],
        )

        return mlsuggestion
Example 14
    def landmark_lda(X, y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if scipy.sparse.issparse(X):
                return np.NaN

            # pylint: disable=C0103
            import sklearn.model_selection
            from sklearn.discriminant_analysis \
                import LinearDiscriminantAnalysis

            if len(y.shape) == 1 or y.shape[1] == 1:
                kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
            else:
                kf = sklearn.model_selection.KFold(n_splits=10)

            accuracy = 0.
            try:
                for train, test in kf.split(X, y):
                    lda = LinearDiscriminantAnalysis()

                    if len(y.shape) == 1 or y.shape[1] == 1:
                        lda.fit(X[train], y[train])
                    else:
                        lda = OneVsRestClassifier(lda)
                        lda.fit(X[train], y[train])

                    predictions = lda.predict(X[test])
                    accuracy += sklearn.metrics.accuracy_score(
                        predictions, y[test])
                return accuracy / 10
            except scipy.linalg.LinAlgError as ex:
                # pylint: disable=W1201
                logging.warning("LDA failed: %s Returned 0 instead!" % ex)
                return np.NaN
            except ValueError as ex:
                # pylint: disable=W1201
                logging.warning("LDA failed: %s Returned 0 instead!" % ex)
                return np.NaN
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Landmark LDA could not be computed. Returning 0 instead. \
Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example 15
    def _init_attributes(self, arff_dataset):
        """Initialize the attributes of the class.

        We map each of the arff dict keys into an attribute.
        """
        self.description = arff_dataset['description']
        self.name = arff_dataset['relation']
        self.data = self._parse_data(arff_dataset)
        if arff_dataset['attributes']:
            if isinstance(arff_dataset['attributes'][0], tuple):
                self.key_attributes = [arff_dataset['attributes'][0][0]]
            else:
                automl.automl_log(
                    "First element in 'attributes' is not a tuple'. Skipping \
'key_attributes' assignment.", 'WARNING')
        else:
            automl.automl_log(
                "'attributes' field is an empty list. Errors may occur. \
Skipping 'key_attributes' assignment.", 'WARNING')
Example 16
    def class_ocurrences(y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            if len(y.shape) == 2:
                occurences = []
                for i in range(y.shape[1]):
                    occurences.append(
                        StatisticalInformation.class_ocurrences(y[:, i]))
                return occurences
            else:
                occurence_dict = defaultdict(float)
                for value in y:
                    occurence_dict[value] += 1
                return occurence_dict
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Class ocurrences could not be computed. Returning 0 instead. \
Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
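
For a 1-D target the helper returns a defaultdict of raw counts, and for a 2-D target a list with one such dict per output column. A small illustration, assuming StatisticalInformation.class_ocurrences is importable:

    import numpy as np

    y = np.array([0, 1, 1, 2, 2, 2])
    counts = StatisticalInformation.class_ocurrences(y)
    print(dict(counts))   # {0: 1.0, 1: 2.0, 2: 3.0}
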
Example 17
    def class_probability_min(y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            occurences = StatisticalInformation.class_ocurrences(y)

            min_value = np.iinfo(np.int64).max
            if len(y.shape) == 2:
                for i in range(y.shape[1]):
                    for num_occurences in occurences[i].values():
                        if num_occurences < min_value:
                            min_value = num_occurences
            else:
                for num_occurences in occurences.values():
                    if num_occurences < min_value:
                        min_value = num_occurences
            return float(min_value) / float(y.shape[0])
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Class probability min could not be computed. Returning 0 \
instead. Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
Example 18
    def pca(X):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            import sklearn.decomposition
            rs = np.random.RandomState(42)  # pylint: disable=C0103
            indices = np.arange(X.shape[0])

            if scipy.sparse.issparse(X):
                pca = sklearn.decomposition.PCA(copy=True)
                for i in range(10):
                    try:
                        rs.shuffle(indices)
                        pca.fit(X[indices])
                        return pca
                    except LinAlgError:
                        pass
                automl_log("Failed to compute a Principle Component Analysis",
                           'WARNING')
                return None
            else:
                # This is expensive, but necessary with scikit-learn 0.15
                Xt = X.astype(np.float64)  # pylint: disable=C0103
                for i in range(10):
                    try:
                        rs.shuffle(indices)
                        truncated_svd = sklearn.decomposition.TruncatedSVD(
                            n_components=X.shape[1] - 1,
                            random_state=i,
                            algorithm="randomized")
                        truncated_svd.fit(Xt[indices])
                        return truncated_svd
                    except LinAlgError:
                        pass
                logging.warning("Failed to compute a Truncated SVD")
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "PCA could not be computed. Returning None instead. Originally\
 failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return None
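
Downstream meta-features typically read explained_variance_ratio_ from the fitted decomposition. A hedged sketch of how the returned object could be consumed (the helper may return None on failure; StatisticalInformation.pca is assumed to be the static method above):

    import numpy as np
    from sklearn.datasets import load_iris

    X, _ = load_iris(return_X_y=True)
    decomposition = StatisticalInformation.pca(X)
    if decomposition is not None:
        ratios = decomposition.explained_variance_ratio_
        # e.g. fraction of components needed to explain 95% of the variance
        n_95 = int(np.searchsorted(np.cumsum(ratios), 0.95)) + 1
        print(n_95 / float(X.shape[1]))
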
Example 19
    def class_entropy(y):  # pylint: disable=C0103
        """Compute statistic."""
        try:
            labels = 1 if len(y.shape) == 1 else y.shape[1]
            if labels == 1:
                y = y.reshape((-1, 1))

            entropies = []
            for i in range(labels):
                occurence_dict = defaultdict(float)
                for value in y[:, i]:
                    occurence_dict[value] += 1
                entropies.append(
                    scipy.stats.entropy(
                        [occurence_dict[key] for key in occurence_dict],
                        base=2))

            return np.mean(entropies)
        except Exception as ex:  # pylint: disable=W0703
            automl_log(
                "Class entropy could not be computed. Returning 0 instead. \
Originally failed with exception '{ex}'".format(ex=ex), 'WARNING')
            return 0.
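
The per-label entropy can be reproduced directly with scipy.stats.entropy on raw counts; for a perfectly balanced binary target it is exactly 1 bit:

    import scipy.stats

    counts = [2, 2]   # e.g. y = [0, 0, 1, 1]
    print(scipy.stats.entropy(counts, base=2))   # 1.0
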
Example 20
    def get_openml_dataset(openml_id, problem_type):
        """Fetch a dataset from OpenML and return a Dataset object.

        Args:
            openml_id (int): ID for the dataset, as stored in OpenML.
            problem_type (int): Type of problem to solve in the dataset.
                0 for classification, 1 for regression.

        Returns:
            Dataset: An auto-ml Dataset, as defined in this module.
                Its default ID will be the concatenation of the OpenML dataset
                name and ID.

        """
        automl_log(
            "Loading dataset {d_id} from OpenML:".format(d_id=openml_id),
            'INFO')
        openml_dataset = oml.datasets.get_dataset(openml_id)
        features, target, categorical_indicators, attribute_names = \
            openml_dataset.get_data(
                target=openml_dataset.default_target_attribute,
                return_attribute_names=True,
                return_categorical_indicator=True
            )

        if scipy.sparse.issparse(features):
            raise CurrentlyNonSupportedError("Sparse datasets are not \
supported yet in Achmea's auto-ml solution")

        features = pd.DataFrame(features, columns=attribute_names)
        target = pd.DataFrame(target, columns=[_TARGET_NAME])

        return Dataset(dataset_id="{}-{}".format(openml_dataset.dataset_id,
                                                 openml_dataset.name),
                       X=features, y=target,
                       categorical_indicators=categorical_indicators,
                       problem_type=problem_type)
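
A hedged usage sketch: fetching the iris dataset (OpenML id 61) as a classification problem. It assumes network access, an openml version that still accepts the return_attribute_names / return_categorical_indicator flags used above, and that the function is exposed on the Dataset class:

    # Illustrative only -- requires network access and a compatible openml version.
    dataset = Dataset.get_openml_dataset(openml_id=61, problem_type=0)
    print(dataset.dataset_id)   # e.g. '61-iris'
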
Example 21
    def models_by_metric(instances_ids=None, dataset=None, metric='accuracy'):
        """Return the models for a list of instances by the given accuracy.

        Args:
            instances_ids (list): List of integers with the ids of the
                instances (datasets).
            dataset (Dataset): The dataset to work with.
            metric (str): Name of the metric to use. It must be one of the
                metrics returned by LandmarkModelParser.metrics_available().

        Returns:
            list: List of models. One element per instance.

        """
        # Validation of arguments
        if instances_ids is None:
            raise ValueError("A list of instances' ids must be specified.")

        if dataset is None:
            raise ValueError("Please provide a valid dataset.")

        if not isinstance(dataset, Dataset):
            raise TypeError("Dataset must be of type Dataset (automl pkg)")

        if dataset.is_regression_problem():
            raise CurrentlyNonSupportedError("Meta-learning for regression is \
                                             not supported yet")

        # Create helper variables
        if dataset.is_classification_problem():
            problem_type = "classification"
            classif_type = "multiclass" if dataset.n_labels > 2 else "binary"
        # sparse or not
        data_type = "sparse" if dataset.is_sparse() else "dense"

        # metric to use is composed of `metric`_`binary/multiclass`. e.g.
        # accuracy_binary
        internal_metric = "{me}_{c_type}".format(me=metric,
                                                 c_type=classif_type)
        # problem_description is classification_`sparse/dense`. E.g.
        # classification_sparse
        problem_desc = "{p_type}_{d_type}".format(p_type=problem_type,
                                                  d_type=data_type)

        # Then the final basename_dir (name of the metric in auto-sklearn) is
        # the mix of the above. E.g. accuracy_binary.classification_sparse
        basename_dir = \
            "{metric}.{problem}".format(metric=internal_metric,
                                        problem=problem_desc)

        # Get the available metrics to validate the resolved metric is part of
        # list
        metrics_available = LandmarkModelParser.metrics_available()
        if internal_metric not in metrics_available:
            raise ValueError("Metric '{argument}' is not supported. Try any \
                             of the following metrics: {available}".format(
                argument=metric, available=metrics_available))

        # Get the corresponding configurations.csv file path
        configs_csv = LandmarkModelParser._configs_file_by_metric(basename_dir)
        # Get the corresponding algorithm_runs.arff file path
        algoruns_arff = \
            LandmarkModelParser._algorithm_runs_file_by_metric(basename_dir)

        # Validate we found valid files
        if configs_csv is None or algoruns_arff is None:
            raise ValueError("Some of the meta-learning files was not found \
in the database for metric '{metric}'".format(metric=basename_dir))

        # Start to resolve the result
        res = []
        # For each of the instances requested
        for instance_id in instances_ids:
            # Instantiate the corresponding algorithm runs file
            try:
                algoruns_file = AlgorithmRunsFile(algoruns_arff)
                # get the configuration id for that instance
                config_id = \
                    algoruns_file.get_associated_configuration_id(instance_id)

                # And then load the configurations.csv file
                config_file = ConfigurationsFile(configs_csv)

                # Resolve the configuration as a list
                mmb = ConfigurationBuilder(
                    config_file.get_configuration(config_id))
                # and append it to the list
                res.append(mmb.build_configuration())
            except ValueError:
                automl_log(
                    "Instance (dataset) with id={inst_id} has no \
meta-knowledge associated for metric '{metric}'. We will ignore this dataset \
and you should expect fewer ML Suggestions.".format(inst_id=instance_id,
                                                    metric=basename_dir),
                    'WARNING')
        return res