def evaluate(self, array_datas): """ Create a scatter plot between multiple variables """ result_object = ResultObject(None, None, None, CommandStatus.Error) sns.set(color_codes=True) command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas) if command_status == CommandStatus.Error: return result_object if len(df.columns) <= 1: Printer.Print("There needs to be atleast two variables to perform multiscatter plot!") return result_object win = Window.window() f = win.gcf() ax = f.add_subplot(111) if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]: df.dropna(inplace=True) pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', ax=ax) else: gt1 = pd.Series(StatContainer.filterGroundTruth()) df, gt1 = DataGuru.removenan(df, gt1) lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size))) row_colors = gt1.map(lut) pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', c=row_colors, cmap="jet", ax=ax) f.suptitle(cname) win.show() return VizContainer.createResult(win, array_datas, ['multiscatter'])
def evaluate(self, data_frame, classifier_algo):
    """
    Train a classifier on multiple arrays
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    df = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.filterGroundTruth()

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)

    # Get the classifier model
    model = classifier_algo.data[0]

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Train the classifier
    Printer.Print("Training the classifier")
    df_show = pd.DataFrame()
    df_show['Features'] = df.columns
    TablePrinter.printDataFrame(df_show)
    model.fit(X, Y)

    # Print an update
    Printer.Print("The classifier", classifier_algo.name, "has been trained")
    predictions = model.predict(X)
    accuracy = metrics.accuracy_score(Y, predictions)
    Printer.Print("Accuracy on training set: %s" % "{0:.3%}".format(accuracy))

    # Keep the scaler together with the model so new data can be transformed
    # the same way at prediction time
    trained_model = {'Scaler': scaler, 'Model': model}
    result_object = ResultObject(trained_model, [], DataType.trained_model,
                                 CommandStatus.Success)
    classifier_algo_name = classifier_algo.name.replace('.', ' ')
    result_object.createName(data_frame.keyword_list,
                             command_name=classifier_algo_name,
                             set_keyword_list=True)
    return result_object
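# Hedged sketch (illustrative only): the core scikit-learn pattern used above,
# standardize the features, fit a classifier, report training accuracy, and bundle
# the scaler with the model. LogisticRegression stands in for whatever classifier
# the classifier_algo argument provides.
import numpy as np
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression


def train_demo():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 4))
    y = (X[:, 0] + X[:, 1] > 0).astype(int)

    scaler = preprocessing.StandardScaler().fit(X)
    Xs = scaler.transform(X)

    model = LogisticRegression()
    model.fit(Xs, y)

    acc = metrics.accuracy_score(y, model.predict(Xs))
    print("Training accuracy: {0:.3%}".format(acc))
    return {'Scaler': scaler, 'Model': model}  # same bundle shape as trained_model above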
def evaluate(self, data_frame, classifier_model):
    """
    Run a trained classifier on multiple arrays
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    sns.set(color_codes=True)
    df = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth to ...'",
                      "to get the prediction accuracy")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)
    X = df.values

    # Get the trained classifier model and its scaler
    trained_model = classifier_model.data
    model = trained_model['Model']
    scaler = trained_model['Scaler']

    # Scale the values with the standardizer fit during training
    X = scaler.transform(X)

    # Run the classifier and plot the classification result
    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    Printer.Print('Running the trained classifier...')
    predictions = model.predict(X)
    accuracy = metrics.accuracy_score(Y, predictions)
    Printer.Print("Accuracy: %s" % "{0:.3%}".format(accuracy))
    cm = metrics.confusion_matrix(Y, predictions)
    DataGuru.plot_confusion_matrix(cm, np.unique(Y), ax,
                                   title="confusion matrix")
    win.show()

    # TODO: Need to save the result
    return ResultObject(None, None, None, CommandStatus.Success)
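# Hedged sketch (illustrative only): applying a previously trained
# {'Scaler': ..., 'Model': ...} bundle to new data and drawing a confusion matrix
# with plain matplotlib, analogous to what DataGuru.plot_confusion_matrix does above.
# The function name and arguments are invented for the example.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics


def run_trained_demo(trained_model, X_new, y_true):
    Xs = trained_model['Scaler'].transform(X_new)
    predictions = trained_model['Model'].predict(Xs)
    print("Accuracy: {0:.3%}".format(metrics.accuracy_score(y_true, predictions)))

    cm = metrics.confusion_matrix(y_true, predictions)
    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap='Blues')
    ax.set_xticks(range(len(np.unique(y_true))))
    ax.set_yticks(range(len(np.unique(y_true))))
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion matrix')
    fig.colorbar(im)
    plt.show()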
def evaluate(self, data_frame, array_datas): """ Run Isomap on a dataset of multiple arrays """ # Get the data frame if data_frame is not None: df = data_frame.data df = DataGuru.convertStrCols_toNumeric(df) cname = data_frame.name elif array_datas is not None: command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas, useCategorical=True) if command_status == CommandStatus.Error: return ResultObject(None, None, None, CommandStatus.Error) else: Printer.Print("Please provide data frame or arrays to analyze") return ResultObject(None, None, None, CommandStatus.Error) Y = None if StatContainer.ground_truth is not None: df = DataGuru.removeGT(df, StatContainer.ground_truth) Y = StatContainer.filterGroundTruth() df, Y = DataGuru.removenan(df, Y) # Remove nans: else: df.dropna(inplace=True) # Get the Isomap model # Code to run the classifier X = df.values # Get a standard scaler for the extracted data X scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) # Train the classifier win = Window.window() properties = self.createDefaultProperties() properties['title'] = cname # return ResultObject(None, None, None, CommandStatus.Success) if data_frame is not None: result_object = VizContainer.createResult(win, data_frame, ['ismp']) else: result_object = VizContainer.createResult(win, array_datas, ['ismp']) result_object.data = [win, properties, [X, Y], self.updateFigure] self.updateFigure(result_object.data) self.modify_figure.evaluate(result_object) return result_object
def evaluate(self, data_frame, target):
    """
    Use one of the models to identify the top predictors
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    df = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.filterGroundTruth()

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)

    # How many top features to report
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        num = int(numbers[0].data)
    else:
        num = 10  # If not specified, select the top 10 features

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Rank the features by random-forest importance
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, Y)
    featImpVals = model.feature_importances_
    featimp = pd.Series(featImpVals,
                        index=df.columns).sort_values(ascending=False)

    df_show = pd.DataFrame()
    df_show['top features'] = featimp.index[0:num]
    df_show['feature importance'] = featimp.values[0:num]
    TablePrinter.printDataFrame(df_show)

    df_new = df[featimp.index[0:num]]
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    command_name = 'top.predictors'
    result_object.createName(data_frame.name, command_name=command_name,
                             set_keyword_list=True)
    return result_object
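# Hedged sketch (illustrative only): ranking features by random-forest importance,
# the same idea as the 'top.predictors' command above, stripped of the framework
# plumbing. The demo function name is invented.
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier


def top_predictors_demo(df, y, num=10):
    X = preprocessing.StandardScaler().fit_transform(df.values)
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y)
    # Importance per column, sorted from most to least informative
    featimp = pd.Series(model.feature_importances_,
                        index=df.columns).sort_values(ascending=False)
    top = featimp.iloc[:num]
    print(top)
    return df[top.index]  # data frame restricted to the top predictors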
def preEvaluate(self, data_frame, array_datas, classifier_algo):
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    sns.set(color_codes=True)
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            print("Error in getting dataframe!")
            result_object.data = "Error in getting dataframe!"
            return result_object
    else:
        result_object.data = "Please provide a data frame or arrays to analyze"
        return result_object

    # Get the ground truth array
    if StatContainer.ground_truth is None:
        result_object.data = ("Please set a feature vector to ground truth by "
                              "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)

    # Get the classifier model
    model = classifier_algo.data[0]

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    properties = self.createDefaultProperties()
    properties['title'] = cname

    # Run cross validation and return both the CV output and the auxiliary data
    cv_output = self.performCV(properties, X, Y, model)
    aux_output = (properties, [X, Y, model])
    return [ResultObject(cv_output, None), ResultObject(aux_output, None)]
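# Hedged sketch (illustrative only): the kind of k-fold cross validation that
# performCV above presumably wraps — mean accuracy over 10 folds for a scaled
# model. LogisticRegression and the fold count are assumptions for the demo.
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


def cross_validation_demo(X, y, folds=10):
    Xs = preprocessing.StandardScaler().fit_transform(X)
    scores = cross_val_score(LogisticRegression(), Xs, y,
                             cv=folds, scoring='accuracy')
    print("Mean CV accuracy: {0:.3%} (+/- {1:.3%})".format(scores.mean(),
                                                           scores.std()))
    return scores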
def evaluate(self, array_datas): """ Create a scatter plot between two variables """ sns.set(color_codes=True) command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas) if command_status == CommandStatus.Error: Printer.Print("please try the following command:", "Visualize comparison between...") return ResultObject(None, None, None, CommandStatus.Error) properties = self.createDefaultProperties() properties['title'] = cname win = Window.window() row_colors = None if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]: df.dropna(inplace=True) if df.shape[0] == 0: return ResultObject(None, None, None, CommandStatus.Error) array = df.values else: gt1 = pd.Series(StatContainer.filterGroundTruth()) df, gt1 = DataGuru.removenan(df, gt1) if df.shape[0] == 0: return ResultObject(None, None, None, CommandStatus.Error) lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size))) row_colors = gt1.map(lut) array = df.values result_object = VizContainer.createResult( win, array_datas, ['scatter2d']) result_object.data = [win, properties, [ array, row_colors, kl1], self.updateFigure] self.updateFigure(result_object.data) self.modify_figure.evaluate(result_object) return result_object
def evaluate(self, data_frame, classifier_algos):
    """
    Train multiple classifiers on the data and compare them with k-fold cross validation
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    sns.set(color_codes=True)
    df = data_frame.data

    # Get the ground truth array
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)
    Printer.Print("Training classifiers using the following features:")
    Printer.Print(df.columns)

    # Get all the classifier models to test against each other
    modelList = []
    Printer.Print("Testing the following classifiers:")
    for classifier_algo in classifier_algos:
        model = classifier_algo.data[0]
        Printer.Print(classifier_algo.name)
        modelList.append({'Name': classifier_algo.name, 'Model': model})

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    Printer.Print('Finding the best classifier using',
                  'k-fold cross validation...')
    all_cv_scores, all_mean_cv_scores, all_confusion_matrices = DataGuru.FindBestClassifier(
        X, Y, modelList, 10)

    Printer.Print('\n\nPlotting the confusion matrices...\n')
    for i in range(len(modelList)):
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        DataGuru.plot_confusion_matrix(all_confusion_matrices[i],
                                       np.unique(Y), ax,
                                       title=modelList[i]['Name'])
        win.show()

    Printer.Print("\n\nBest classifier is " +
                  modelList[np.argmax(all_mean_cv_scores)]['Name'] +
                  " with an accuracy of %.2f%%" % max(all_mean_cv_scores))

    # TODO: Need to save the model and ask the user for a name for it
    result_object = ResultObject(None, None, None, CommandStatus.Success)
    return result_object
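# Hedged sketch (illustrative only): one plausible reading of what
# DataGuru.FindBestClassifier does — run k-fold cross validation for each candidate
# model and report the one with the highest mean accuracy. The candidate list and
# function name are assumptions for the demo.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


def find_best_classifier_demo(X, y, folds=10):
    candidates = [{'Name': 'Logistic regression', 'Model': LogisticRegression()},
                  {'Name': 'Random forest', 'Model': RandomForestClassifier()}]
    mean_scores = []
    for entry in candidates:
        scores = cross_val_score(entry['Model'], X, y, cv=folds,
                                 scoring='accuracy')
        mean_scores.append(scores.mean())
        print("{0}: {1:.3%}".format(entry['Name'], scores.mean()))
    best = candidates[int(np.argmax(mean_scores))]
    print("Best classifier:", best['Name'])
    return best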
def evaluate(self, data_frame, array_datas): """ Run pca on a dataset of multiple arrays """ # Get the data frame if data_frame is not None: df = data_frame.data df = DataGuru.convertStrCols_toNumeric(df) cname = data_frame.name elif array_datas is not None: command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas, useCategorical=True) if command_status == CommandStatus.Error: return ResultObject(None, None, None, CommandStatus.Error) else: Printer.Print("Please provide data frame or arrays to analyze") return ResultObject(None, None, None, CommandStatus.Error) Y = None if StatContainer.ground_truth is not None: df = DataGuru.removeGT(df, StatContainer.ground_truth) Y = StatContainer.filterGroundTruth() # Remove nans: df, Y = DataGuru.removenan(df, Y) else: df.dropna(inplace=True) # Code to run the classifier X = df.values # Get a standard scaler for the extracted data X scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) # Train the classifier pca = PCA(n_components=2) pca_res = pca.fit_transform(X) win = Window.window() f = win.gcf() ax = f.add_subplot(111) if Y is None: sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], cmap="jet", edgecolor="None", alpha=0.35) else: sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], c=Y, cmap="jet", edgecolor="None", alpha=0.35) cbar = plt.colorbar(sc) cbar.ax.get_yaxis().labelpad = 15 cbar.ax.set_ylabel(StatContainer.ground_truth.name, rotation=270) ax.set_title(cname) win.show() # return ResultObject(None, None, None, CommandStatus.Success) if data_frame is not None: return VizContainer.createResult(win, data_frame, ['pca']) else: return VizContainer.createResult(win, array_datas, ['pca'])
def evaluate(self, data_frame, array_datas, target):
    """
    Run clustering on a dataset of multiple arrays
    """
    # Get the data frame
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas, useCategorical=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide a data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)

    Y = None
    if StatContainer.ground_truth is not None:
        df = DataGuru.removeGT(df, StatContainer.ground_truth)
        Y = StatContainer.filterGroundTruth()
        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)
    else:
        df.dropna(inplace=True)

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # How many clusters to use
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        num_clusters = int(numbers[0].data)
    else:
        num_clusters = 2  # If not specified, use 2 clusters

    # Run the clustering operation
    kY = self.performOperation(X, num_clusters)

    result_objects = []
    if StatContainer.ground_truth is not None:
        # Cross-tabulate the clustering result against the ground truth
        df_res = pd.DataFrame()
        df_res['ground_truth'] = Y
        df_res['clustering_result'] = kY
        df_res = df_res.pivot_table(index=df_res.columns[0],
                                    columns=df_res.columns[1],
                                    aggfunc=np.size, fill_value=0)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        df_res = DataGuru.convertStrCols_toNumeric(df_res)
        sns.heatmap(df_res, ax=ax)
        win.show()
        if data_frame is not None:
            result_object = VizContainer.createResult(
                win, data_frame, ['clstr.fig'])
        else:
            result_object = VizContainer.createResult(
                win, array_datas, ['clstr.fig'])
        result_objects.append(result_object)

    result_object = ResultObject(kY, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name="clstr",
                             set_keyword_list=True)
    result_objects.append(result_object)
    return result_objects
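# Hedged sketch (illustrative only): clustering followed by a cluster-vs-label
# contingency table, the comparison the heatmap above visualizes. KMeans stands in
# for whatever performOperation runs; the demo function name is invented.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans


def clustering_demo(X, y, num_clusters=2):
    Xs = preprocessing.StandardScaler().fit_transform(X)
    kY = KMeans(n_clusters=num_clusters, n_init=10).fit_predict(Xs)
    # Contingency table: rows are ground-truth labels, columns are cluster ids
    table = pd.crosstab(pd.Series(y, name='ground_truth'),
                        pd.Series(kY, name='clustering_result'))
    fig, ax = plt.subplots()
    sns.heatmap(table, annot=True, fmt='d', ax=ax)
    plt.show()
    return kY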