Beispiel #1
0
    def apply(self, df, options):
        """Run the fitted selector on ``df`` and merge the kept fields back in.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``output_name`` (default ``'fs'``) is
                used as the prefix of every output field.

        Returns:
            The result of merging the transformed columns into ``df``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        transformed = self.estimator.transform(features.values)

        # Keep only the names of the columns flagged by the estimator's mask.
        support_mask = self.estimator.get_support()
        selected = np.array(self.columns)[support_mask]

        # Nothing selected: warn, but still build the (empty) output.
        if len(selected) == 0:
            cexc.messages.warn(
                'No fields pass the current configuration. Consider changing your parameters.'
            )

        prefix = options.get('output_name', 'fs')
        output_names = [prefix + '_%s' % name for name in selected]

        selected_df = df_util.create_output_dataframe(
            y_hat=transformed,
            nans=nans,
            output_names=output_names,
        )

        return df_util.merge_predictions(df, selected_df)
Beispiel #2
0
    def apply(self, df, options=None):
        """Predict the target for ``df`` using the fitted scaler and estimator.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Optional dict of options. ``mlspl_limits`` is forwarded
                to feature preparation and ``output_name`` overrides the
                default ``predicted(<target>)`` field name. ``None`` is
                treated as an empty dict.

        Returns:
            The result of merging the prediction column into ``df``.
        """
        # The signature lets callers omit options, but the body reads from it
        # with .get(); normalise None so that call path does not raise.
        if options is None:
            options = {}

        # Handle backwards compatibility.
        add_missing_attr(self.estimator,
                         attr='max_iter',
                         value=5,
                         param_key='n_iter')
        add_missing_attr(self.estimator, attr='tol', value=None)

        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # The estimator is fed scaled features, so scale before predicting.
        scaled_X = self.scaler.transform(X.values)
        y_hat = self.estimator.predict(scaled_X)

        default_name = 'predicted({})'.format(self.target_variable)
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )

        output = df_util.merge_predictions(df, output)
        return output
Beispiel #3
0
    def apply(self, df, options):
        """Transform the feature field with the fitted estimator and merge the result.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation and the whole dict is passed to
                ``self.make_output_names``.

        Returns:
            The result of merging the transformed columns into ``df``.
        """
        # Prepare a copy of the data with get_dummies switched off.
        prepared, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Flatten the prepared values into a 1-D array of strings.
        documents = prepared.values.ravel().astype('str')

        # The estimator returns a sparse matrix; densify it for the output frame.
        dense = self.estimator.transform(documents).toarray()

        transformed_df = df_util.create_output_dataframe(
            y_hat=dense,
            output_names=self.make_output_names(options),
            nans=nans,
        )

        return df_util.merge_predictions(df, transformed_df)
Beispiel #4
0
    def fit(self, df, options):
        """Fit the scaler and estimator on ``df`` and merge the cluster labels.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``output_name`` (default ``'cluster'``)
                names the output field.

        Returns:
            The result of merging the label column into ``df``.

        Raises:
            RuntimeError: If the prepared data is non-empty but has no more
                rows than ``self.estimator.n_clusters``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
        )

        # A non-empty input must contain more rows than requested clusters.
        if 0 < len(features) <= self.estimator.n_clusters:
            raise RuntimeError(
                "k must be smaller than the number of events used as input")

        scaled = self.scaler.fit_transform(features.values)
        raw_labels = self.estimator.fit_predict(scaled)

        # Render labels as integer-looking strings; NaN becomes an empty string.
        labels = []
        for value in raw_labels:
            if np.isnan(value):
                labels.append('')
            else:
                labels.append(str('%.0f' % value))

        label_df = df_util.create_output_dataframe(
            y_hat=labels,
            nans=nans,
            output_names=options.get('output_name', 'cluster'),
        )
        return df_util.merge_predictions(df, label_df)
Beispiel #5
0
    def fit(self, df, options):
        """Cluster the data and merge labels plus silhouette scores into ``df``.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``output_name`` (default ``'cluster'``)
                names the label field.

        Returns:
            The result of merging the label and ``silhouette_score`` columns
            into ``df``.
        """
        # Clean a copy of the input; ``nans`` records where null values were.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
        )

        # Cluster, then score every row against its assigned cluster.
        labels = self.estimator.fit_predict(features.values)
        scores = silhouette_samples(features, labels)

        # Stack labels and scores as rows, then transpose so each becomes a column.
        labels_and_scores = np.vstack([labels, scores]).T

        # First column is the label (as-clause name if given), second the score.
        label_name = options.get('output_name', 'cluster')
        column_names = [label_name, 'silhouette_score']

        result_df = df_util.create_output_dataframe(
            y_hat=labels_and_scores,
            nans=nans,
            output_names=column_names,
        )

        return df_util.merge_predictions(df, result_df)
    def apply(self, df, options):
        """Predict with the fitted estimator, flip the sign, and merge the result.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation and ``output_name`` (if present) is passed to
                ``self.rename_output`` as the new name for ``'isOutlier'``.

        Returns:
            The result of merging the stringified predictions into ``df``.
        """
        log = get_logger('IsolationForest Logger')

        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Negate the raw predictions. Presumably the estimator follows the
        # scikit-learn convention (-1 outlier, +1 inlier), so after the flip
        # outliers are 1 and inliers are -1 -- TODO confirm.
        flipped = self.estimator.predict(features.values) * -1

        # Log the percentage of rows whose flipped label is -1.
        minus_one_count = list(flipped).count(-1)
        percentage = round((minus_one_count * 100) / flipped.shape[0], 2)
        log.debug("Accuracy: {}".format(str(percentage)))

        flipped = flipped.astype('str')

        # Resolve the output field name.
        requested_name = options.get('output_name', None)
        field_name = self.rename_output(default_names='isOutlier',
                                        new_names=requested_name)

        result_df = df_util.create_output_dataframe(
            y_hat=flipped, nans=nans, output_names=field_name
        )
        return df_util.merge_predictions(df, result_df)
Beispiel #7
0
    def apply(self, df, options):
        """Run ``fit_transform`` on the prepared features and merge the result.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``output_name`` (if present) is passed
                to ``self.make_output_names`` and ``self.rename_output``.

        Returns:
            The result of merging the transformed columns into ``df``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        # NOTE(review): this refits the estimator on the data being applied
        # (fit_transform, not transform) -- confirm that is intended.
        transformed = self.estimator.fit_transform(features.values)

        # One default name per output column, then apply any requested rename.
        requested_name = options.get('output_name', None)
        generated_names = self.make_output_names(
            output_name=requested_name,
            n_names=transformed.shape[1],
        )
        final_names = self.rename_output(generated_names, requested_name)

        transformed_df = df_util.create_output_dataframe(
            y_hat=transformed,
            nans=nans,
            output_names=final_names,
        )

        return df_util.merge_predictions(df, transformed_df)
Beispiel #8
0
    def apply(self, df, options):
        """Predict cluster labels for ``df`` and merge them in as strings.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation and ``output_name`` (if present) is passed to
                ``self.rename_output`` as the new name for ``'cluster'``.

        Returns:
            The result of merging the label column into ``df``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Stringify the labels so they are not rendered with floating points.
        labels = self.estimator.predict(features.values).astype('str')

        # Resolve the output field name.
        requested_name = options.get('output_name', None)
        field_name = self.rename_output(default_names='cluster',
                                        new_names=requested_name)

        label_df = df_util.create_output_dataframe(
            y_hat=labels,
            nans=nans,
            output_names=field_name,
        )

        return df_util.merge_predictions(df, label_df)
Beispiel #9
0
    def apply(self, df, options):
        """Predict the target for ``df`` and merge the predictions in.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation and ``output_name`` (if present) is passed to
                ``self.rename_output`` as the new name for
                ``predicted(<target>)``.

        Returns:
            The result of merging the prediction column into ``df``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        predictions = self.estimator.predict(features.values)

        # Resolve the output field name.
        fallback_name = 'predicted({})'.format(self.target_variable)
        requested_name = options.get('output_name', None)
        field_name = self.rename_output(default_names=fallback_name,
                                        new_names=requested_name)

        prediction_df = df_util.create_output_dataframe(
            y_hat=predictions,
            nans=nans,
            output_names=field_name,
        )

        return df_util.merge_predictions(df, prediction_df)
    def apply(self, df, options):
        """Predict the target for ``df``, optionally adding class probabilities.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation, ``output_name`` overrides the default
                ``predicted(<target>)`` field name, and the whole dict is
                passed to ``self.check_probabilities``.

        Returns:
            The result of merging the prediction column (and, when
            ``self.check_probabilities(options)`` is truthy, one
            ``probability(<target>=<class>)`` column per class) into ``df``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        predictions = self.estimator.predict(features.values)

        fallback_name = 'predicted({})'.format(self.target_variable)
        result = df_util.create_output_dataframe(
            y_hat=predictions,
            nans=nans,
            output_names=options.get('output_name', fallback_name),
        )

        if self.check_probabilities(options):
            probabilities = self.estimator.predict_proba(features.values)

            # One probability column per class known to the estimator.
            probability_names = []
            for cls_name in self.estimator.classes_:
                probability_names.append(
                    'probability({}={})'.format(self.target_variable, cls_name)
                )

            probability_df = df_util.create_output_dataframe(
                y_hat=probabilities,
                nans=nans,
                output_names=probability_names,
            )

            # Place the probability columns beside the prediction column.
            result = pd.concat([result, probability_df], axis=1)

        return df_util.merge_predictions(df, result)
Beispiel #11
0
    def apply(self, df, options):
        """Predict clusters for ``df`` and add a ``cluster_distance`` column.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation and ``output_name`` (default ``'cluster'``) names
                the label field.

        Returns:
            The result of merging the label column and the
            ``cluster_distance`` column into ``df``. In the label column, NaN
            entries are replaced by ``''`` and all others are cast to ``int``.
        """
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        y_hat = self.estimator.predict(X.values)

        default_name = 'cluster'
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )
        df_values = X[self.columns].values
        cluster_ctrs = self.estimator.cluster_centers_

        # Squared Euclidean distance from each row to its assigned center.
        # The builtin zip is used instead of izip, which Python 3 does not
        # have; iteration order and results are the same.
        dist = [
            np.nan if np.isnan(cluster) else np.sum(
                np.square(cluster_ctrs[cluster] - row))
            for (cluster, row) in zip(y_hat, df_values)
        ]

        dist_df = df_util.create_output_dataframe(
            y_hat=dist,
            nans=nans,
            output_names='cluster_distance',
        )

        output = df_util.merge_predictions(output, dist_df)
        df = df_util.merge_predictions(df, output)

        # Show labels as integers; rows without a label get an empty string.
        df[output_name] = df[output_name].apply(lambda c: ''
                                                if np.isnan(c) else int(c))
        return df
Beispiel #12
0
    def fit(self, df, options):
        """Apply ``savgol_filter`` to every prepared column and merge the result.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Not read by this method.

        Returns:
            The result of merging one ``SG_<column>`` field per prepared
            column into ``df``.
        """
        features, nans, columns = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
        )

        def smooth(series):
            # Filter a single column with the configured parameters.
            return savgol_filter(series, self.window_length, self.polyorder,
                                 self.deriv)

        # Axis 0: the filter is applied down each column independently.
        smoothed = np.apply_along_axis(smooth, 0, features)

        smoothed_df = df_util.create_output_dataframe(
            y_hat=smoothed,
            nans=nans,
            output_names=['SG_%s' % col for col in columns],
        )

        return df_util.merge_predictions(df, smoothed_df)
Beispiel #13
0
    def fit(self, df, options):
        """Fit the estimator on ``df`` and merge the resulting labels in.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Dict of options; ``mlspl_limits`` is forwarded to feature
                preparation and ``output_name`` (default ``'cluster'``) names
                the output field.

        Returns:
            The result of merging the label column into ``df``.
        """
        # Work on a copy so the caller's dataframe is not altered.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )

        labels = self.estimator.fit_predict(features.values)

        label_df = df_util.create_output_dataframe(
            y_hat=labels,
            nans=nans,
            output_names=options.get('output_name', 'cluster'),
        )
        return df_util.merge_predictions(df, label_df)
Beispiel #14
0
    def apply(self, df, options=None):
        """Predict the target for ``df`` using the fitted scaler and estimator.

        Args:
            df: Input dataframe; a copy is handed to feature preparation.
            options: Optional dict of options. ``output_name`` overrides the
                default ``predicted(<target>)`` field name. ``None`` is
                treated as an empty dict.

        Returns:
            The result of merging the prediction column into ``df``.
        """
        # The signature lets callers omit options, but the body reads from it
        # with .get(); normalise None so that call path does not raise.
        if options is None:
            options = {}

        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        # The estimator is fed scaled features, so scale before predicting.
        scaled_X = self.scaler.transform(X.values)
        y_hat = self.estimator.predict(scaled_X)

        default_name = 'predicted({})'.format(self.target_variable)
        output_name = options.get('output_name', default_name)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_name,
        )

        df = df_util.merge_predictions(df, output)
        return df