def get_reduced_embeddings_df(data, embedder: embeddings.EmbeddingVectorizer,
                              reducer: base.BaseEstimator):
    """Embed `data`, then reduce the dimensionality of the embeddings.

    `embedder.transform` is applied to `data` (the embedder is not fitted
    here), and its output is passed to `reducer.fit_transform`. Whatever the
    reducer returns is handed back unchanged.
    """
    return reducer.fit_transform(embedder.transform(data))
Exemple #2
0
    def _impute(self, imputer: BaseEstimator, target: str) -> None:
        """Impute missing data within 'numeric_features', in place.

        Every column listed in ``self.numeric_features`` except ``target`` is
        passed through ``imputer.fit_transform`` and the result is written
        back into ``self.processed_data``. The target variable is never
        imputed. Nothing happens when no such column exists.

        Args:
            imputer (BaseEstimator): Class instance to impute the data. Must have valid
                    'fit_transform' method.
            target (str): Column name for the target variable.

        """
        # De-duplicate while keeping the order of `self.numeric_features`.
        # A plain set difference would pass the columns to the imputer in
        # hash-dependent order, so the fitted imputer's feature order could
        # change from one interpreter run to the next.
        numeric_features_wo_target = list(dict.fromkeys(
            feature for feature in self.numeric_features
            if feature != target))
        if not numeric_features_wo_target:
            return

        columns = self.processed_data.loc[:, numeric_features_wo_target]
        self.processed_data.loc[:, numeric_features_wo_target] = (
            imputer.fit_transform(columns))
Exemple #3
0
    def _scale_data(self, scaler: BaseEstimator, target: str,
                    scale_target: bool) -> None:
        """Scale numeric features, in place.

        The selected columns of ``self.processed_data`` are passed through
        ``scaler.fit_transform`` and overwritten with the result. The target
        variable is included only when ``scale_target`` is true. Nothing
        happens when there is no column to scale.

        Args:
            scaler (BaseEstimator): Class instance to scale the data. Must have valid
                    'fit_transform' method.
            target (str): Column name of target variable.
            scale_target (bool): Whether to scale the target variable or not.

        """
        if scale_target:
            features_to_scale = self.numeric_features
        else:
            # Drop the target but keep the declared feature order (and drop
            # duplicates). A set difference would give the scaler its columns
            # in hash-dependent order, unlike the branch above, so the fitted
            # scaler's feature order could change between runs.
            features_to_scale = list(dict.fromkeys(
                feature for feature in self.numeric_features
                if feature != target))

        if features_to_scale:
            self.processed_data.loc[:, features_to_scale] = (
                scaler.fit_transform(
                    self.processed_data.loc[:, features_to_scale]))
    def transform_dataset(self, algorithm: BaseEstimator, n_folds: int = 5) -> Tuple[pd.DataFrame, Dict[str, float]]:
        """Apply `algorithm` to the loaded dataset and return the result.

        The dataset is loaded with ``self.dataset.load`` and split into the
        features ``X`` (every column except ``self.dataset.class_column``)
        and the labels ``y``.

        * If ``is_classifier(algorithm)`` is true, labels are predicted with
          ``cross_val_predict`` (``cv=n_folds``), evaluation metrics are
          computed from the predictions, and the predictions are appended to
          ``X`` as its last column.
        * Otherwise ``algorithm.fit_transform(X, y)`` is called when that
          method exists, else ``algorithm.fit(X, y).transform(X)``, and the
          output is wrapped in a DataFrame.

        In both cases the columns of the returned frame are relabelled
        ``0..k-1``.

        Args:
            algorithm (BaseEstimator): Classifier or transformer to apply.
            n_folds (int): Number of cross-validation folds used for a
                classifier. Unused for transformers.

        Returns:
            Tuple of the resulting DataFrame and a metrics dictionary. The
            dictionary has the keys 'accuracy', 'precision', 'recall', 'f1',
            'neg_log_loss' and 'roc_auc' for a classifier and is empty
            otherwise.
        """
        # Load the input dataset and the name of its label column.
        df = self.dataset.load(self.s3_config, self.s3_bucket)
        class_column = self.dataset.class_column

        # Split the input dataset into X and y.
        X, y = df.drop(class_column, axis=1), df[class_column]

        if is_classifier(algorithm):
            # Predict labels with n-fold cross-validation.
            y_pred = cross_val_predict(algorithm, X, y, cv=n_folds)

            # Calculate evaluation metrics (multi-class scores are weighted).
            accuracy = accuracy_score(y, y_pred)
            precision = precision_score(y, y_pred, average='weighted')
            recall = recall_score(y, y_pred, average='weighted')
            f1 = f1_score(y, y_pred, average='weighted')
            # TODO
            # NOTE(review): both helpers below receive the hard label
            # predictions, not class probabilities - confirm this is what
            # `logloss` / `multiclass_roc_auc_score` expect, and whether the
            # value stored under 'neg_log_loss' is actually negated.
            log_loss_value = logloss(y, y_pred)
            roc_auc = multiclass_roc_auc_score(y, y_pred, average='weighted')

            # Give the predictions X's own row index before concatenating.
            # pd.concat(axis=1) aligns on index labels, so a default 0..n-1
            # index would misalign rows and introduce NaNs whenever the
            # loaded frame does not carry a default RangeIndex.
            y_pred = pd.Series(y_pred, index=X.index)
            X = pd.concat([X, y_pred], axis=1)
            X.columns = range(X.shape[1])

            return X, {'accuracy': accuracy,
                       'precision': precision,
                       'recall': recall,
                       'f1': f1,
                       'neg_log_loss': log_loss_value,
                       'roc_auc': roc_auc
                       }

        # Not a classifier: prefer fit_transform when the object offers it,
        # otherwise fit on X, y and then transform X.
        if hasattr(algorithm, 'fit_transform'):
            X = algorithm.fit_transform(X, y)
        else:
            # noinspection PyUnresolvedReferences
            X = algorithm.fit(X, y).transform(X)

        # NOTE(review): this assumes the transformer returns a 2-D array-like
        # with a `.shape`. If it returns an already-labelled DataFrame,
        # `columns=range(...)` selects by label instead of renaming - confirm
        # against the transformers in use.
        X = pd.DataFrame(data=X, index=range(X.shape[0]), columns=range(X.shape[1]))

        return X, {}