def evaluate(self, array_datas):
    """
    Render a clustered heatmap of the supplied arrays.

    When a ground truth is set, each row is colour-coded by its
    ground-truth label; otherwise a plain clustermap is drawn.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
        array_datas, remove_nan=True)
    if command_status == CommandStatus.Error:
        return failure
    Printer.Print("Displaying heatmap")
    win = Window.window()
    figure = win.gcf()
    if StatContainer.ground_truth is None:
        sns.clustermap(df, cbar=True, square=False, annot=False,
                       cmap='jet', standard_scale=1)
    else:
        truth = pd.Series(StatContainer.ground_truth.data)
        # Map each distinct label onto one of three colours (r/b/g)
        palette = dict(zip(truth.unique(), "rbg"))
        sns.clustermap(df, standard_scale=1,
                       row_colors=truth.map(palette), cmap="jet")
    win.show()
    return VizContainer.createResult(win, array_datas, ['heatmap'])
def evaluate(self, array_datas):
    """
    Draw a pairwise scatter-matrix of the supplied arrays.

    Points are coloured by ground-truth label when a ground truth is
    set and its length matches the data; otherwise rows with NaNs are
    dropped and an uncoloured matrix is drawn.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        return failure
    if len(df.columns) <= 1:
        Printer.Print("There needs to be atleast two variables to perform multiscatter plot!")
        return failure
    win = Window.window()
    figure = win.gcf()
    axis = figure.add_subplot(111)
    truth = StatContainer.ground_truth
    if truth is None or len(truth.data) != df.shape[0]:
        df.dropna(inplace=True)
        pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', ax=axis)
    else:
        labels = pd.Series(StatContainer.filterGroundTruth())
        df, labels = DataGuru.removenan(df, labels)
        # Spread the distinct labels evenly over the colormap range
        palette = dict(zip(labels.unique(),
                           np.linspace(0, 1, labels.unique().size)))
        pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde',
                                   c=labels.map(palette), cmap="jet",
                                   ax=axis)
    figure.suptitle(cname)
    win.show()
    return VizContainer.createResult(win, array_datas, ['multiscatter'])
def evaluate(self, array_datas):
    """
    Draw violin plots for the supplied arrays.

    With a matching ground truth, the data are melted to long form and
    one violin per (variable, label) pair is drawn; otherwise one
    violin per column.
    """
    sns.set(color_codes=True)
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas, remove_nan=True)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    win = Window.window()
    figure = win.gcf()
    axis = figure.add_subplot(111)
    truth = StatContainer.ground_truth
    if truth is None or len(truth.data) != df.shape[0]:
        df.dropna(inplace=True)
        sns.violinplot(data=df, ax=axis)
    else:
        truth_column = " ".join(truth.keyword_list)
        df[truth_column] = StatContainer.filterGroundTruth()
        df.dropna(inplace=True)
        long_form = pd.melt(df, id_vars=truth_column)
        sns.violinplot(data=long_form, ax=axis, x='variable', y='value',
                       hue=truth_column)
    win.show()
    return VizContainer.createResult(win, array_datas, ['violin'])
def evaluate(self, array_datas):
    """Plot the supplied arrays as line series on a single axis."""
    sns.set(color_codes=True)
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas, useCategorical=True, expand_single=True,
        remove_nan=True)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    empty = df.shape[0] == 0
    lone_non_numeric = (df.shape[1] == 1 and
                        not np.issubdtype(array_datas[0].data.dtype,
                                          np.number))
    if empty or lone_non_numeric:
        Printer.Print("No data left to plot after cleaning up!")
        return ResultObject(None, None, None, CommandStatus.Error)
    win = Window.window()
    axis = win.gcf().add_subplot(111)
    axis.set_title(cname)
    df.plot(ax=axis)
    win.show()
    return VizContainer.createResult(win, array_datas, ['line'])
def evaluate(self, array_datas):
    """
    Create a box plot of the supplied arrays.

    If a ground truth is set and its length matches the data, one box
    is drawn per ground-truth group (``DataFrame.boxplot(by=...)``);
    otherwise one box per column.

    Parameters:
        array_datas: arrays to plot (converted to a DataFrame).

    Returns:
        A visualization ResultObject, or an Error ResultObject when
        the arrays cannot be combined into a DataFrame.
    """
    sns.set(color_codes=True)
    command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
        array_datas)
    # Check for failure BEFORE creating a window; the original created
    # the window first and leaked it when returning the error.
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    if StatContainer.ground_truth is None or len(
            StatContainer.ground_truth.data) != df.shape[0]:
        df.dropna(inplace=True)
        df.boxplot(ax=ax)
    else:
        ground_truth = StatContainer.ground_truth.name
        df[ground_truth] = StatContainer.filterGroundTruth()
        df.dropna(inplace=True)
        df.boxplot(by=ground_truth, ax=ax)
        # Suppress pandas' automatic "Boxplot grouped by ..." suptitle
        f.suptitle("")
    win.show()
    return VizContainer.createResult(win, array_datas, ['box'])
def evaluate(self, data_frame, array_datas): """ Run Isomap on a dataset of multiple arrays """ # Get the data frame if data_frame is not None: df = data_frame.data df = DataGuru.convertStrCols_toNumeric(df) cname = data_frame.name elif array_datas is not None: command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas, useCategorical=True) if command_status == CommandStatus.Error: return ResultObject(None, None, None, CommandStatus.Error) else: Printer.Print("Please provide data frame or arrays to analyze") return ResultObject(None, None, None, CommandStatus.Error) Y = None if StatContainer.ground_truth is not None: df = DataGuru.removeGT(df, StatContainer.ground_truth) Y = StatContainer.filterGroundTruth() df, Y = DataGuru.removenan(df, Y) # Remove nans: else: df.dropna(inplace=True) # Get the Isomap model # Code to run the classifier X = df.values # Get a standard scaler for the extracted data X scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) # Train the classifier win = Window.window() properties = self.createDefaultProperties() properties['title'] = cname # return ResultObject(None, None, None, CommandStatus.Success) if data_frame is not None: result_object = VizContainer.createResult(win, data_frame, ['ismp']) else: result_object = VizContainer.createResult(win, array_datas, ['ismp']) result_object.data = [win, properties, [X, Y], self.updateFigure] self.updateFigure(result_object.data) self.modify_figure.evaluate(result_object) return result_object
def evaluate(self, array_datas):
    """ Create a histogram for multiple variables """
    sns.set(color_codes=True)
    command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
        array_datas, useCategorical=True, remove_nan=True)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    # Only the first column drives the categorical/continuous decision
    dCol = df[df.columns[0]]
    try:
        uniqVals, inv, counts = np.unique(dCol, return_inverse=True,
                                          return_counts=True)
    except:
        # np.unique can fail on mixed/unorderable data
        return ResultObject(None, None, None, CommandStatus.Error)
    if len(uniqVals) > self.max_unique:
        if isinstance(uniqVals[0], str):
            # Too many string categories: keep only the rows belonging
            # to the self.max_unique most frequent values.
            # NOTE(review): uniqVals itself is not truncated here, so
            # the len(uniqVals) > 5 test below still sees the full set
            # — TODO confirm this is intended.
            best_idx = np.argpartition(counts, -self.max_unique)[-self.max_unique:]
            idx = np.isin(inv, best_idx)
            dCol = dCol[idx]
        else:
            # Too many numeric values: treat as continuous
            uniqVals = None
    if uniqVals is not None and isinstance(uniqVals[0], str):
        # Longest category name, used to pick a horizontal layout
        max_len = max([len(uniqVal) for uniqVal in uniqVals])
    else:
        max_len = 0
    if (uniqVals is None and not np.issubdtype(dCol.dtype, np.number)):
        Printer.Print("Too many unique values in non-numeric type data")
        return ResultObject(None, None, None, CommandStatus.Error)
    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    # TODO Create an argument for setting number of bins
    if uniqVals is not None:
        # Categorical data: draw a count plot; long labels go on the
        # y-axis so they remain readable.
        if len(uniqVals) > 5 and max_len > 8:
            df = dCol.to_frame(name=kl1[0])
            sns.countplot(y=kl1[0], data=df, ax=ax)
        else:
            df = dCol.to_frame(name=kl1[0])
            sns.countplot(x=kl1[0], data=df, ax=ax)
    elif np.issubdtype(dCol.dtype, np.number):
        # Continuous data: stacked histogram over all columns
        df.plot.hist(stacked=True, ax=ax)
    win.show()
    return VizContainer.createResult(win, array_datas, ['histogram', 'hist'])
def evaluate(self, array_datas, data_frame):
    """
    Calculate label-wise mean array store it to history

    Runs self.performOperation over the input data frame (or the
    data frame built from the arrays), prints the summary table, and
    stores the table plus one array per column to history.

    Parameters:
        array_datas: arrays to analyze (used when data_frame is None).
        data_frame: a csv/data-frame result to analyze.

    Returns:
        A list of ResultObjects (csv table first, then one array per
        column), or a single Error ResultObject on failure.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Success)
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if len(cname) == 0:
            # Fall back to joining the keyword list as the result name
            cname = ".".join(kl1)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)
    df_new = self.performOperation(df)
    TablePrinter.printDataFrame(df_new)
    result_objects = []
    # Adding the newly created CSV
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    command_name = "smry"
    result_object.createName(cname, command_name=command_name,
                             set_keyword_list=True)
    result_objects.append(result_object)
    # create an updated list of column names by removing the common names
    kl1 = df_new.columns
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    for col in range(0, len(kl1)):
        arr = df_new[kl1[col]]
        result_object = ResultObject(arr, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(truncated_kl1[col],
                                 command_name=command_name,
                                 set_keyword_list=True)
        result_objects.append(result_object)
    return result_objects
def preEvaluate(self, data_frame, array_datas, classifier_algo):
    """
    Prepare data and cross-validate a classifier.

    Builds a DataFrame from ``data_frame`` or ``array_datas``, requires
    a ground truth to be set, drops NaN rows, standardizes the
    features, and cross-validates the model in
    ``classifier_algo.data[0]`` via ``self.performCV``.

    Returns:
        ``[ResultObject(cv_output, None),
        ResultObject((properties, [X, Y, model]), None)]`` on success,
        or a single Error ResultObject carrying a message in ``.data``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    # Get the data frame
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            # Use Printer.Print for user-facing output, consistent with
            # the other commands (was a bare print()).
            Printer.Print("Error in getting dataframe!")
            result_object.data = "Error in getting dataframe!"
            return result_object
    else:
        result_object.data = "Please provide data frame or arrays to analyze"
        return result_object
    # A ground truth is mandatory for supervised evaluation
    if StatContainer.ground_truth is None:
        # Fixed missing space between "by" and "typing" in the message
        result_object.data = (
            "Please set a feature vector to ground truth by "
            "typing set ground truth before using this command")
        return result_object
    # Separate features from the ground-truth column
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data
    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)
    # Get the classifier model
    model = classifier_algo.data[0]
    # Code to run the classifier
    X = df.values
    # Standardize features to zero mean / unit variance
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    properties = self.createDefaultProperties()
    properties['title'] = cname
    cv_output = self.performCV(properties, X, Y, model)
    aux_output = (properties, [X, Y, model])
    return [ResultObject(cv_output, None), ResultObject(aux_output, None)]
def evaluate(self, array_datas):
    """ Create a scatter plot between two variables """
    sns.set(color_codes=True)
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        Printer.Print("please try the following command:",
                      "Visualize comparison between...")
        return ResultObject(None, None, None, CommandStatus.Error)
    properties = self.createDefaultProperties()
    properties['title'] = cname
    win = Window.window()
    row_colors = None
    if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
        # No usable ground truth: just drop NaN rows, no colouring
        df.dropna(inplace=True)
        if df.shape[0] == 0:
            # Nothing left after cleanup
            return ResultObject(None, None, None, CommandStatus.Error)
        array = df.values
    else:
        gt1 = pd.Series(StatContainer.filterGroundTruth())
        df, gt1 = DataGuru.removenan(df, gt1)
        if df.shape[0] == 0:
            return ResultObject(None, None, None, CommandStatus.Error)
        # Map each distinct label to a colormap position in [0, 1]
        lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size)))
        row_colors = gt1.map(lut)
        array = df.values
    result_object = VizContainer.createResult(
        win, array_datas, ['scatter2d'])
    # Payload consumed by updateFigure / the modify-figure dialog
    result_object.data = [win, properties, [
        array, row_colors, kl1], self.updateFigure]
    self.updateFigure(result_object.data)
    self.modify_figure.evaluate(result_object)
    return result_object
def evaluate(self, array_datas):
    """
    Compute pairwise correlations of the arrays and display the
    correlation matrix as an annotated heatmap. For exactly two
    arrays, also print the scalar correlation value.
    """
    if len(array_datas) < 2:
        Printer.Print("Need atleast two arrays to compute correlation")
        return ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
        array_datas, remove_nan=True)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    corr_res = df.corr()
    if len(array_datas) == 2:
        Printer.Print("The correlation between ", kl1[0], " and ",
                      kl1[1], " is ", str(corr_res.values[0][1]))
    Printer.Print("Displaying the result as a heatmap")
    win = Window.window()
    axis = win.gcf().add_subplot(111)
    sns.heatmap(corr_res, cbar=True, square=True, annot=True, fmt='.2f',
                annot_kws={'size': 15}, xticklabels=df.columns,
                yticklabels=df.columns, cmap='jet', ax=axis)
    win.show()
    return VizContainer.createResult(win, array_datas, ['correlation'])
def evaluate(self, array_datas):
    """
    Subtract the first array from the second, element-wise.

    Aligns both arrays into a DataFrame, computes
    ``array_datas[1] - array_datas[0]``, and stores the difference as
    a new array result. (The original docstring wrongly said
    "scatter plot".)

    Returns:
        A ResultObject holding the difference array, or an Error
        ResultObject when the arrays cannot be combined or subtracted.
    """
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        Printer.Print("please try the following command:",
                      "subtract a from b")
        return ResultObject(None, None, None, CommandStatus.Error)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # supported equivalent.
    df_array = df.values
    try:
        out = df_array[:, 1] - df_array[:, 0]
    except Exception:
        # e.g. fewer than two columns, or non-numeric dtypes
        return ResultObject(None, None, None, CommandStatus.Error)
    result_object = ResultObject(out, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(array_datas[0].keyword_list,
                             array_datas[1].keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)
    return result_object
def evaluate(self, array_datas):
    """Concatenate the supplied arrays column-wise into a new dataframe."""
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        Printer.Print(
            "Please check whether the arrays are of the same size")
        return ResultObject(None, None, None, CommandStatus.Error)
    result = ResultObject(df, [], DataType.csv, CommandStatus.Success)
    result.createName(cname, command_name='concatenate.array',
                      set_keyword_list=True)
    TablePrinter.printDataFrame(df)
    return result
def evaluate(self, array_datas, data_frame):
    """
    Run Welch's t-test between every pair of ground-truth groups for
    every feature and store the p-value table to history.

    Parameters:
        array_datas: arrays to analyze (used when data_frame is None).
        data_frame: a csv/data-frame result to analyze.

    Returns:
        A list of ResultObjects (p-value table as csv, then one array
        per column), or a single Error ResultObject on failure.
    """
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)
    if StatContainer.ground_truth is None:
        # Printer.Print (not bare print) for user-facing messages,
        # consistent with the parallel ROC command.
        Printer.Print("Could not find the reference variable.")
        Printer.Print("Please set the reference variable")
        return ResultObject(None, None, None, CommandStatus.Error)
    gtVals = StatContainer.filterGroundTruth()
    ground_truth = StatContainer.ground_truth.name
    if len(gtVals) != df.shape[0]:
        Printer.Print(
            "The size of the ground truth does not match with arrays being analyzed"
        )
        Printer.Print(len(gtVals), df.shape[0])
        return ResultObject(None, None, None, CommandStatus.Error)
    uniqVals = StatContainer.isCategorical(gtVals)
    df[ground_truth] = gtVals
    df_new = pd.DataFrame()
    if ground_truth in df.columns:
        df_new['features'] = df.columns.drop(ground_truth).values
    else:
        df_new['features'] = df.columns
    allCols = df_new['features']
    # Pre-create one zeroed p-value column per pair of groups
    # (renamed loop vars: `iter` shadowed the builtin)
    for i in range(len(uniqVals)):
        for j in range(i + 1, len(uniqVals)):
            df_new['pValue: ' + str(i) + ' vs ' +
                   str(j)] = np.zeros(df_new.shape[0])
    for feature_idx in range(len(df_new['features'])):
        arr = df[allCols[feature_idx]]
        for i in range(len(uniqVals)):
            a = arr[gtVals == uniqVals[i]]
            for j in range(i + 1, len(uniqVals)):
                b = arr[gtVals == uniqVals[j]]
                col = 'pValue: ' + str(i) + ' vs ' + str(j)
                if uniqVals[i] != uniqVals[j]:
                    # Welch's t-test (unequal variances)
                    ttest_val = scipy.stats.ttest_ind(a, b, axis=0,
                                                      equal_var=False)
                    # .loc instead of chained indexing: the original
                    # df_new[col][idx] = x assignment is unreliable
                    # (SettingWithCopy) and a no-op under pandas
                    # copy-on-write.
                    df_new.loc[feature_idx, col] = ttest_val.pvalue
                else:
                    df_new.loc[feature_idx, col] = 0
    TablePrinter.printDataFrame(df_new)
    result_objects = []
    # Adding the newly created csv
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name='sigtest',
                             set_keyword_list=True)
    result_objects.append(result_object)
    # create an updated list of column names by removing the common names
    kl1 = df_new.columns
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    for col_idx in range(0, len(kl1)):
        arr = df_new[kl1[col_idx]]
        result_object = ResultObject(arr, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(truncated_kl1[col_idx],
                                 command_name='sigtest',
                                 set_keyword_list=True)
        result_objects.append(result_object)
    return result_objects
def evaluate(self, array_datas):
    """ Create a bar plot between multiple variables """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    if StatContainer.ground_truth is None:
        # No ground truth: a single pseudo-group covering all rows
        gtVals = np.ones(df.shape[0])
        ground_truth = 'ground_truth'
    else:
        gtVals = StatContainer.filterGroundTruth()
        ground_truth = StatContainer.ground_truth.name
        if len(gtVals) != df.shape[0]:
            print("ground truth does not match with df shape")
            print(len(gtVals), df.shape[0])
            # Fall back to the single pseudo-group
            gtVals = np.ones(df.shape[0])
            ground_truth = 'ground_truth'
    # Remove nans:
    df[ground_truth] = gtVals
    df.dropna(inplace=True)
    gtVals = df[ground_truth]
    uniqVals = StatContainer.isCategorical(gtVals)
    binned_ground_truth = False
    if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
        # Convert to categorical by binning the numeric ground truth
        df[ground_truth] = pd.cut(gtVals, 10)
        binned_ground_truth = True
    if binned_ground_truth is True or uniqVals is not None:
        # Group-wise mean as bar height, std as error bars
        gb = df.groupby(ground_truth)
        df_mean = gb.mean()
        df_errors = gb.std()
        if uniqVals is not None and isinstance(uniqVals[0], str):
            # Shorten string group labels by stripping shared words
            truncated_uniqVals, _ = StatContainer.removeCommonNames(
                df_mean.index)
            df_mean.index = truncated_uniqVals
            df_errors.index = truncated_uniqVals
        # Number of uniq_vals x number of arrs
        df_mean_shape = df_mean.shape
        if (not binned_ground_truth and
                df_mean_shape[1] >= df_mean_shape[0]):
            # More arrays than groups: transpose so bars group by array
            df_mean = df_mean.T
            df_errors = df_errors.T
    else:
        Printer.Print("Ground truth could not be mapped to",
                      "categorical array\n")
        Printer.Print("Please clear or select appropriate ground truth")
        return result_object
    properties = self.createDefaultProperties()
    properties['title'] = cname
    if uniqVals is not None and isinstance(uniqVals[0], str):
        # Longest group label, used to pick a horizontal layout
        max_len = max([len(uniqVal) for uniqVal in uniqVals])
    else:
        max_len = 0
    if (binned_ground_truth or (uniqVals is not None and
                                len(uniqVals) > 5 and max_len > 8)):
        properties["horizontal"] = True
    if binned_ground_truth:
        properties["overwrite_labels"] = True
        properties["ylabel"] = StatContainer.ground_truth.name
    win = Window.window()
    result_object = VizContainer.createResult(win, array_datas, ['bar'])
    # Payload consumed by updateFigure / the modify-figure dialog
    result_object.data = [
        win, properties, [df_mean, df_errors], self.updateFigure
    ]
    self.updateFigure(result_object.data)
    self.modify_figure.evaluate(result_object)
    return result_object
def evaluate(self, array_datas, data_frame):
    """
    Compute a per-feature ROC AUC against the ground truth and store
    the result table to history.

    For each feature a logistic regression is fit on that feature
    alone, and the mean one-vs-rest AUC over all ground-truth classes
    is reported.

    Parameters:
        array_datas: arrays to analyze (used when data_frame is None).
        data_frame: a csv/data-frame result to analyze.

    Returns:
        A list of ResultObjects (AUC table as csv, then one array per
        column), or a single Error ResultObject on failure.
    """
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)
    if StatContainer.ground_truth is None:
        Printer.Print("Could not find the reference variable.")
        Printer.Print("Please set the reference variable")
        return ResultObject(None, None, None, CommandStatus.Error)
    gtVals = StatContainer.filterGroundTruth()
    ground_truth = StatContainer.ground_truth.name
    if len(gtVals) != df.shape[0]:
        Printer.Print(
            "The size of the ground truth does not match with arrays being analyzed"
        )
        Printer.Print(len(gtVals), df.shape[0])
        return ResultObject(None, None, None, CommandStatus.Error)
    uniqVals = StatContainer.isCategorical(gtVals)
    df[ground_truth] = gtVals
    df_new = pd.DataFrame()
    if ground_truth in df.columns:
        df_new['features'] = df.columns.drop(ground_truth).values
    else:
        df_new['features'] = df.columns
    allCols = df_new['features']
    # (Dropped the original dead double loop that repeatedly assigned
    # df_new['AUC'] = 0 — the final assignment below overwrites it.)
    avgAUC = []
    for feature_idx in range(len(df_new['features'])):
        arr = df[allCols[feature_idx]]
        model = LogisticRegression()
        X1 = arr.values.reshape(-1, 1)
        model.fit(X1, gtVals)
        # Evaluate the model: mean one-vs-rest AUC over all classes
        allAUC = []
        Y_Pr = model.predict_proba(X1)
        for class_idx in range(len(uniqVals)):
            # The original computed this roc_curve twice back-to-back;
            # once is enough.
            fpr, tpr, thresholds = metrics.roc_curve(
                gtVals, Y_Pr[:, class_idx],
                pos_label=uniqVals[class_idx])
            allAUC.append(metrics.auc(fpr, tpr))
        avgAUC.append(np.mean(allAUC))
    df_new['AUC'] = avgAUC
    TablePrinter.printDataFrame(df_new)
    # New data frame
    result_objects = []
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name='rcurve',
                             set_keyword_list=True)
    result_objects.append(result_object)
    # create an updated list of column names by removing the common names
    kl1 = df_new.columns
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    for col_idx in range(0, len(kl1)):
        arr = df_new[kl1[col_idx]]
        result_object = ResultObject(arr, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(truncated_kl1[col_idx],
                                 command_name='rcurve',
                                 set_keyword_list=True)
        result_objects.append(result_object)
    return result_objects
def evaluate(self, array_datas):
    """
    Compute a group-wise statistic (named by self._condition[0], e.g.
    mean) of the arrays, grouped by the ground-truth labels, and store
    one result array per output column to history.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    if isinstance(array_datas, list) and len(array_datas) == 0:
        return result_object
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    if StatContainer.ground_truth is None:
        # No ground truth: a single pseudo-group covering all rows
        gtVals = np.ones(df.shape[0])
        gtName = 'ground_truth'
    else:
        gtVals = StatContainer.filterGroundTruth()
        gtName = StatContainer.ground_truth.name
    # Remove nans:
    df[gtName] = gtVals
    df.dropna(inplace=True)
    gtVals = df[gtName]
    uniqVals = StatContainer.isCategorical(gtVals, uniqueCutoff=1000)
    # NOTE(review): this flag is set but never read below — dead code.
    binned_ground_truth = True
    if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
        # Convert to categorical
        # NOTE(review): uniqVals remains None after binning, so the
        # `if uniqVals is not None` branch below is still skipped and
        # this binning has no effect — looks like a latent bug; compare
        # the bar-plot command, which re-checks after binning.
        df[gtName] = pd.cut(gtVals, 10)
        binned_ground_truth = True
    # Create groupwise arrays
    result_objects = []
    if uniqVals is not None:
        df_new = self.performOperation(df, gtName)
        df_new = df_new.reset_index()
        for col in df_new.columns:
            arr = df_new[col]
            kName = []
            if col == '':
                # Unnamed column: fall back to the first array's keywords
                kName = array_datas[0].keyword_list
            else:
                # kName.append(cname)
                kName.append(col)
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)
            command_name = 'labelwise.' + self._condition[0]
            result_object.createName(kName, command_name=command_name,
                                     set_keyword_list=True)
            result_objects.append(result_object)
        TablePrinter.printDataFrame(df_new)
    else:
        Printer.Print("The array is not of numeric type so cannot",
                      "calculate groupwise " + self._condition[0])
        result_objects.append(result_object)
    return result_objects
def evaluate(self, data_frame, array_datas, target):
    """
    Run clustering on a dataset of multiple arrays.

    The number of clusters is parsed from ``target`` (defaults to 2).
    When a ground truth is set, a contingency table of ground truth
    vs. cluster assignment is shown as a heatmap.

    Returns:
        A list of ResultObjects — optionally the heatmap figure, plus
        the cluster-assignment array — or a single Error ResultObject.
    """
    # Get the data frame
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas, useCategorical=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)
    Y = None
    if StatContainer.ground_truth is not None:
        df = DataGuru.removeGT(df, StatContainer.ground_truth)
        Y = StatContainer.filterGroundTruth()
        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)
    else:
        df.dropna(inplace=True)
    # Standardize features before clustering
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    # Parse the requested number of clusters from the target phrase
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        num_clusters = int(numbers[0].data)
    else:
        num_clusters = 2  # If not specified use 2 clusters
    kY = self.performOperation(X, num_clusters)
    result_objects = []
    if StatContainer.ground_truth is not None:
        df_res = pd.DataFrame()
        df_res['ground_truth'] = Y
        df_res['clustering_result'] = kY
        # BUG FIX: pivot_table returns a new frame; the original
        # discarded the result (a no-op call) and then plotted the raw
        # label columns instead of the intended ground-truth x cluster
        # contingency table.
        df_res = df_res.pivot_table(index=df_res.columns[0],
                                    columns=df_res.columns[1],
                                    aggfunc=np.size, fill_value=0)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        df_res = DataGuru.convertStrCols_toNumeric(df_res)
        sns.heatmap(df_res, ax=ax)
        win.show()
        if data_frame is not None:
            result_object = VizContainer.createResult(
                win, data_frame, ['clstr.fig'])
        else:
            result_object = VizContainer.createResult(
                win, array_datas, ['clstr.fig'])
        result_objects.append(result_object)
    # Always store the cluster assignments themselves
    result_object = ResultObject(kY, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name="clstr",
                             set_keyword_list=True)
    result_objects.append(result_object)
    return result_objects
def evaluate(self, data_frame, array_datas): """ Run pca on a dataset of multiple arrays """ # Get the data frame if data_frame is not None: df = data_frame.data df = DataGuru.convertStrCols_toNumeric(df) cname = data_frame.name elif array_datas is not None: command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas, useCategorical=True) if command_status == CommandStatus.Error: return ResultObject(None, None, None, CommandStatus.Error) else: Printer.Print("Please provide data frame or arrays to analyze") return ResultObject(None, None, None, CommandStatus.Error) Y = None if StatContainer.ground_truth is not None: df = DataGuru.removeGT(df, StatContainer.ground_truth) Y = StatContainer.filterGroundTruth() # Remove nans: df, Y = DataGuru.removenan(df, Y) else: df.dropna(inplace=True) # Code to run the classifier X = df.values # Get a standard scaler for the extracted data X scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) # Train the classifier pca = PCA(n_components=2) pca_res = pca.fit_transform(X) win = Window.window() f = win.gcf() ax = f.add_subplot(111) if Y is None: sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], cmap="jet", edgecolor="None", alpha=0.35) else: sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], c=Y, cmap="jet", edgecolor="None", alpha=0.35) cbar = plt.colorbar(sc) cbar.ax.get_yaxis().labelpad = 15 cbar.ax.set_ylabel(StatContainer.ground_truth.name, rotation=270) ax.set_title(cname) win.show() # return ResultObject(None, None, None, CommandStatus.Success) if data_frame is not None: return VizContainer.createResult(win, data_frame, ['pca']) else: return VizContainer.createResult(win, array_datas, ['pca'])