def fit(self, df, options):
    """Cluster the feature columns and append labels and silhouette scores.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``output_name`` (optional)
            overrides the default label column name ``'cluster'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and a
        two-column frame (cluster label, ``silhouette_score``).
    """
    # prepare_features drops null columns/rows and expands categorical
    # columns into dummy indicators; `nans` is the mask of null locations.
    features, nans, _ = df_util.prepare_features(df.copy(), self.feature_variables)

    # Cluster, then score each row against its assigned cluster.
    labels = self.estimator.fit_predict(features.values)
    scores = silhouette_samples(features, labels)

    # Stack as rows and transpose: one row per sample -> [label, score].
    predictions = np.vstack([labels, scores]).T

    # The as-clause, when present, renames only the label column.
    label_column = options.get('output_name', 'cluster')
    output_df = df_util.create_output_dataframe(
        predictions, nans, [label_column, 'silhouette_score']
    )

    return df_util.merge_predictions(df, output_df)
def apply(self, df, options):
    """Transform the data with the fitted selector and append the kept fields.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``output_name`` (optional)
            replaces the default output prefix ``'fs'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        transformed columns, each named ``<prefix>_<original column>``.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
    )

    transformed = self.estimator.transform(features.values)

    # Boolean support mask -> names of the columns the estimator kept.
    support = self.estimator.get_support()
    kept_columns = np.array(self.columns)[support]

    if not len(kept_columns):
        cexc.messages.warn(
            'No fields pass the current configuration. Consider changing your parameters.'
        )

    prefix = options.get('output_name', 'fs')
    output_names = [prefix + '_%s' % name for name in kept_columns]

    selected = df_util.create_output_dataframe(
        y_hat=transformed,
        nans=nans,
        output_names=output_names,
    )
    return df_util.merge_predictions(df, selected)
def apply(self, df, options=None):
    """Scale the features, predict the target and merge it into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: optional mapping of search options. ``mlspl_limits`` is
            forwarded to ``df_util.prepare_features`` and ``output_name``
            overrides the default ``'predicted(<target_variable>)'``.
            ``None`` (the default) is treated as an empty mapping.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        prediction column.
    """
    # The signature allows options to be omitted, but the body reads it with
    # .get(); normalise None so that call pattern no longer raises
    # AttributeError.
    if options is None:
        options = {}

    # Handle backwards compatibility (attributes that may be absent on the
    # stored estimator are supplied by add_missing_attr).
    add_missing_attr(self.estimator, attr='max_iter', value=5, param_key='n_iter')
    add_missing_attr(self.estimator, attr='tol', value=None)

    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Apply the scaler before predicting.
    scaled_X = self.scaler.transform(X.values)
    y_hat = self.estimator.predict(scaled_X)

    default_name = 'predicted({})'.format(self.target_variable)
    output_name = options.get('output_name', default_name)

    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    output = df_util.merge_predictions(df, output)
    return output
def apply(self, df, options):
    """Vectorize the text field with the fitted estimator and merge the result.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features`` and the whole mapping is passed to
            ``self.make_output_names``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the dense
        transformed columns.
    """
    # Dummy encoding is switched off: the raw values are what get transformed.
    prepared, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
        get_dummies=False,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Flatten to a 1-D array of strings before handing it to the estimator.
    documents = prepared.values.ravel().astype('str')

    # transform() returns a sparse matrix; densify it for the output frame.
    dense = self.estimator.transform(documents).toarray()

    names = self.make_output_names(options)
    transformed = df_util.create_output_dataframe(
        y_hat=dense,
        output_names=names,
        nans=nans,
    )
    return df_util.merge_predictions(df, transformed)
def fit(self, df, options):
    """Scale the features, cluster them and merge string labels into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``output_name`` (optional)
            overrides the default column name ``'cluster'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        cluster-label column.

    Raises:
        RuntimeError: if the prepared data is non-empty but has no more rows
            than ``self.estimator.n_clusters``.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
    )

    # Only a non-empty input is checked against the cluster count here.
    n_rows = len(features)
    if 0 < n_rows <= self.estimator.n_clusters:
        raise RuntimeError(
            "k must be smaller than the number of events used as input")

    scaled = self.scaler.fit_transform(features.values)
    raw_labels = self.estimator.fit_predict(scaled)

    # Render labels as integer-looking strings; NaN becomes an empty string.
    labels = [
        '' if np.isnan(label) else str('%.0f' % label)
        for label in raw_labels
    ]

    column_name = options.get('output_name', 'cluster')
    clusters = df_util.create_output_dataframe(
        y_hat=labels,
        nans=nans,
        output_names=column_name,
    )
    return df_util.merge_predictions(df, clusters)
def apply(self, df, options):
    """Predict with the fitted estimator and merge a sign-flipped label column.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features`` and ``output_name`` (optional) is
            passed to ``self.rename_output`` to replace ``'isOutlier'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the label
        column, whose values are converted to strings.
    """
    logger = get_logger('IsolationForest Logger')

    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Flip the sign of the raw predictions. Assuming the estimator follows
    # the scikit-learn outlier convention (-1 = outlier, 1 = inlier), this
    # yields 1 for outliers and -1 for inliers.
    flipped = self.estimator.predict(features.values)*-1

    # Debug figure: percentage of rows whose flipped label equals -1.
    minus_one_count = list(flipped).count(-1)
    percentage = (minus_one_count*100)/flipped.shape[0]
    logger.debug("Accuracy: {}".format(str(round(percentage, 2))))

    labels = flipped.astype('str')

    # Resolve the output column name.
    requested_name = options.get('output_name', None)
    column_name = self.rename_output(default_names='isOutlier', new_names=requested_name)

    outliers = df_util.create_output_dataframe(
        y_hat=labels,
        nans=nans,
        output_names=column_name
    )

    return df_util.merge_predictions(df, outliers)
def apply(self, df, options):
    """Project the data with the estimator and merge the components into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``output_name`` (optional) is
            used when naming the output columns.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        transformed columns.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
    )

    # NOTE: this calls fit_transform, so the estimator is re-fitted on the
    # incoming data each time apply runs rather than only transforming it.
    projected = self.estimator.fit_transform(features.values)

    # One default name per output column, then apply any user override.
    requested_name = options.get('output_name', None)
    generated_names = self.make_output_names(
        output_name=requested_name,
        n_names=projected.shape[1],
    )
    final_names = self.rename_output(generated_names, requested_name)

    components = df_util.create_output_dataframe(
        y_hat=projected,
        nans=nans,
        output_names=final_names,
    )

    return df_util.merge_predictions(df, components)
def apply(self, df, options):
    """Assign clusters with the fitted estimator and merge them into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features`` and ``output_name`` (optional) is
            passed to ``self.rename_output`` to replace ``'cluster'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        cluster column (values converted to strings).
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    # Cast to str so the labels are not rendered with floating points.
    labels = self.estimator.predict(features.values).astype('str')

    requested_name = options.get('output_name', None)
    column_name = self.rename_output(default_names='cluster', new_names=requested_name)

    clusters = df_util.create_output_dataframe(
        y_hat=labels,
        nans=nans,
        output_names=column_name,
    )

    return df_util.merge_predictions(df, clusters)
def apply(self, df, options):
    """Predict the target with the fitted estimator and merge it into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features`` and ``output_name`` (optional) is
            passed to ``self.rename_output`` to replace the default
            ``'predicted(<target_variable>)'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        prediction column.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    predictions = self.estimator.predict(features.values)

    # Default column name embeds the target; the user may override it.
    fallback_name = 'predicted({})'.format(self.target_variable)
    requested_name = options.get('output_name', None)
    column_name = self.rename_output(default_names=fallback_name, new_names=requested_name)

    predicted = df_util.create_output_dataframe(
        y_hat=predictions,
        nans=nans,
        output_names=column_name,
    )

    return df_util.merge_predictions(df, predicted)
def fit(self, df, options):
    """Compute the polynomial features and return a DataFrame.

    Args:
        df: input DataFrame; a copy is handed to ``prepare_features``.
        options: mapping of search options (not read by this method).

    Returns:
        The result of ``merge_predictions`` on ``df`` and the generated
        feature columns.
    """
    (X, nans, columns) = prepare_features(df.copy(), self.feature_variables)

    # Fit first: the feature names are requested only after the preprocessor
    # has seen the data (same evaluation order as before).
    transformed = self.preprocessor.fit_transform(X)
    feature_names = self.get_feature_names(columns)

    # Reuse the cleaned frame's index. Building the frame with a fresh
    # positional index ignores any rows prepare_features dropped, so the new
    # columns could line up with the wrong rows of `df` on merge.
    # NOTE(review): assumes merge_predictions aligns on index - confirm.
    X_hat = DataFrame(
        transformed,
        columns=feature_names,
        index=getattr(X, 'index', None),
    )

    return merge_predictions(df, X_hat)
def apply(self, df, options):
    """Assign clusters and add a ``cluster_distance`` column.

    The distance is the sum of squared differences between a row and the
    center of the cluster it was assigned to.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features`` and ``output_name`` (optional)
            overrides the default column name ``'cluster'``.

    Returns:
        The merged frame holding the cluster column (integers, or ``''``
        where the value is NaN) and ``cluster_distance``.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    labels = self.estimator.predict(features.values)

    column_name = options.get('output_name', 'cluster')
    clusters = df_util.create_output_dataframe(
        y_hat=labels,
        nans=nans,
        output_names=column_name,
    )

    rows = features[self.columns].values
    centers = self.estimator.cluster_centers_

    # Squared distance of each row to its own cluster center.
    distances = []
    for label, row in izip(labels, rows):
        if np.isnan(label):
            distances.append(np.nan)
        else:
            distances.append(np.sum(np.square(centers[label] - row)))

    distance_df = df_util.create_output_dataframe(
        y_hat=distances,
        nans=nans,
        output_names='cluster_distance',
    )

    # Labels and distances are combined first, then merged into the input.
    clusters = df_util.merge_predictions(clusters, distance_df)
    merged = df_util.merge_predictions(df, clusters)

    # Show labels as ints; NaN entries become an empty string.
    merged[column_name] = merged[column_name].apply(
        lambda c: '' if np.isnan(c) else int(c)
    )
    return merged
def fit(self, df, options):
    """Apply ``savgol_filter`` to every prepared column and merge the result.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options (not read by this method).

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and one
        ``SG_<column>`` output per prepared column.
    """
    features, nans, columns = df_util.prepare_features(df.copy(), self.feature_variables)

    # Filter column by column (axis 0) with the configured window, polynomial
    # order and derivative.
    smoothed = np.apply_along_axis(
        lambda series: savgol_filter(
            series, self.window_length, self.polyorder, self.deriv
        ),
        0,
        features,
    )

    output_names = ['SG_%s' % column for column in columns]
    smoothed_df = df_util.create_output_dataframe(smoothed, nans, output_names)

    return df_util.merge_predictions(df, smoothed_df)
def apply(self, df, options):
    """Predict the target and, when requested, the per-class probabilities.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features``, ``output_name`` (optional)
            overrides ``'predicted(<target_variable>)'``, and the whole
            mapping is passed to ``self.check_probabilities``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        prediction column, plus one ``probability(<target>=<class>)``
        column per class when ``self.check_probabilities(options)`` is
        truthy.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        final_columns=self.columns,
        mlspl_limits=options.get('mlspl_limits'),
    )

    predictions = self.estimator.predict(features.values)

    fallback_name = 'predicted({})'.format(self.target_variable)
    column_name = options.get('output_name', fallback_name)

    result = df_util.create_output_dataframe(
        y_hat=predictions,
        nans=nans,
        output_names=column_name,
    )

    if self.check_probabilities(options):
        probabilities = self.estimator.predict_proba(features.values)

        # One column per class known to the estimator.
        probability_names = []
        for class_label in self.estimator.classes_:
            probability_names.append(
                'probability({}={})'.format(self.target_variable, class_label)
            )

        probability_df = df_util.create_output_dataframe(
            y_hat=probabilities,
            nans=nans,
            output_names=probability_names,
        )

        # Place the probability columns next to the prediction column.
        result = pd.concat([result, probability_df], axis=1)

    return df_util.merge_predictions(df, result)
def fit(self, df, options):
    """Fit the clustering estimator and merge the labels into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: mapping of search options; ``mlspl_limits`` is forwarded to
            ``df_util.prepare_features`` and ``output_name`` (optional)
            overrides the default column name ``'cluster'``.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        cluster-label column.
    """
    features, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables,
        mlspl_limits=options.get('mlspl_limits')
    )

    # Fit and label in a single pass.
    labels = self.estimator.fit_predict(features.values)

    column_name = options.get('output_name', 'cluster')
    clusters = df_util.create_output_dataframe(
        y_hat=labels,
        nans=nans,
        output_names=column_name
    )

    return df_util.merge_predictions(df, clusters)
def apply(self, df, options=None):
    """Scale the features, predict the target and merge it into ``df``.

    Args:
        df: input DataFrame; it is copied before feature preparation.
        options: optional mapping of search options; ``output_name``
            overrides the default ``'predicted(<target_variable>)'``.
            ``None`` (the default) is treated as an empty mapping.

    Returns:
        The result of ``df_util.merge_predictions`` on ``df`` and the
        prediction column.
    """
    # The signature allows options to be omitted, but the body reads it with
    # .get(); normalise None so that call pattern no longer raises
    # AttributeError.
    if options is None:
        options = {}

    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables,
        final_columns=self.columns,
    )

    # Apply the scaler before predicting.
    scaled_X = self.scaler.transform(X.values)
    y_hat = self.estimator.predict(scaled_X)

    default_name = 'predicted({})'.format(self.target_variable)
    output_name = options.get('output_name', default_name)

    output = df_util.create_output_dataframe(
        y_hat=y_hat,
        nans=nans,
        output_names=output_name,
    )

    df = df_util.merge_predictions(df, output)
    return df