def evaluate(self, alpha_script, parent_parser):
    """
    Run an alfarvis script

    Each non-empty line that does not start with '#' is passed, in order,
    to ``parent_parser.data.parse``. Execution stops at the first line that
    leaves the parser in ``ParserStates.command_known_data_unknown``.

    Parameters:
        alpha_script   object whose ``data`` exposes ``data_type`` and ``path``
        parent_parser  object whose ``data`` is the parser used for each line

    Returns:
        ResultObject with Error status if the input is not an alpha script
        or the file cannot be read; Success status otherwise (also when the
        script was stopped early on an ambiguous command).
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    if alpha_script.data.data_type is not DataType.alpha_script:
        Printer.Print("File not of alpha script type: ",
                      alpha_script.data.data_type)
        return result_object
    # Get the lines. The context manager closes the file handle even when
    # reading fails half way through.
    try:
        with open(alpha_script.data.path) as script_file:
            lines = [line.rstrip('\n') for line in script_file]
    except Exception:
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate
        Printer.Print("Alpha script not found")
        return ResultObject(None, None, None, CommandStatus.Error)
    # Update parent parser state
    parent_parser.data.clearCommandSearchResults()
    for i, line in enumerate(lines):
        line = line.lstrip()
        print("Line: ", line)
        if not line or line[0] == '#':
            continue  # Ignore blank lines and comments
        parent_parser.data.parse(line)
        if (parent_parser.data.currentState ==
                ParserStates.command_known_data_unknown):
            # i is the zero-based index of the offending line
            Printer.Print("Ambiguous command at line: ", i)
            Printer.Print("Exiting script")
            break
    # Reset the parser whether the script finished or was aborted
    parent_parser.data.clearCommandSearchResults()
    result_object = ResultObject(None, None, None, CommandStatus.Success)
    return result_object
def evaluate(self, array_datas):
    """
    Create a box plot between multiple variables

    The arrays are converted to a data frame via
    ``DataGuru.transformArray_to_dataFrame``. When a ground truth is set and
    its length matches the number of rows, the boxes are grouped by it;
    otherwise every column is drawn as one box.

    Parameters:
        array_datas  arrays to plot (passed through to DataGuru unchanged)

    Returns:
        ResultObject with Error status if the conversion failed, otherwise
        the figure result built by ``VizContainer.createResult``.
    """
    sns.set(color_codes=True)
    command_status, df, _, _ = DataGuru.transformArray_to_dataFrame(
        array_datas)
    # Check the status before opening a window so that a failed conversion
    # does not leave an empty figure behind.
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)

    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    ground_truth_data = StatContainer.ground_truth
    if (ground_truth_data is None or
            len(ground_truth_data.data) != df.shape[0]):
        df.dropna(inplace=True)
        df.boxplot(ax=ax)
    else:
        ground_truth = ground_truth_data.name
        df[ground_truth] = StatContainer.filterGroundTruth()
        df.dropna(inplace=True)
        df.boxplot(by=ground_truth, ax=ax)
        # Blank out the figure-level title after the grouped plot
        f.suptitle("")
    win.show()
    return VizContainer.createResult(win, array_datas, ['box'])
def createResult(self, out, keyword_list):
    """
    Wrap ``out`` in a successful logical-array result.

    Parameters:
        out           data stored in the result
        keyword_list  keywords handed to ``createName`` together with the
                      command name 'between'

    Returns:
        the named ResultObject
    """
    wrapped = ResultObject(out, [], DataType.logical_array,
                           CommandStatus.Success, True)
    wrapped.createName(keyword_list,
                       set_keyword_list=True,
                       command_name='between')
    return wrapped
def evaluate(self, image):
    """
    Display the image specified

    Parameters:
        image  either a file-name result (``data.path`` is read with
               ``imread``) or a result whose ``data`` is shown directly

    Returns:
        ResultObject with Success status once the image is shown. For a
        file-name input the result also carries the loaded image. Error
        status is returned if the file is missing or anything fails while
        loading or displaying.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)
    try:
        if image.data_type is DataType.file_name:
            file_path = image.data.path
            if not os.path.isfile(file_path):
                Printer.Print("Cannot find image file: ", file_path)
                # Return directly instead of raising just to reach the handler
                return failure
            curr_image = imread(file_path)
            result_object = ResultObject(
                curr_image, image.keyword_list, DataType.image,
                CommandStatus.Success)
        else:
            curr_image = image.data
            result_object = ResultObject(
                None, None, None, CommandStatus.Success)
        image_name = image.keyword_list[0]
        win = Window.window()
        plt.imshow(curr_image)
        plt.gca().axis('off')
        win.show()
        # A space separates the message from the name (it used to be glued on)
        Printer.Print("Displaying image " + image_name)
    except Exception:
        # Best effort: any failure is reported as an error result, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        result_object = failure
    return result_object
def evaluate(self, array_datas):
    """
    Show a clustered heatmap (seaborn clustermap) of the given arrays.

    When a ground truth is set, its values are mapped onto the colours
    "r", "b", "g" (in order of first appearance) and passed as row colours.

    Parameters:
        array_datas  arrays to visualise (converted by DataGuru with NaNs
                     removed)

    Returns:
        Error ResultObject if the conversion fails, otherwise the figure
        result from ``VizContainer.createResult``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    status, frame, _keywords, _ = DataGuru.transformArray_to_dataFrame(
        array_datas, remove_nan=True)
    if status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)

    Printer.Print("Displaying heatmap")
    win = Window.window()
    f = win.gcf()
    if StatContainer.ground_truth is not None:
        labels = pd.Series(StatContainer.ground_truth.data)
        palette = dict(zip(labels.unique(), "rbg"))
        sns.clustermap(frame, standard_scale=1,
                       row_colors=labels.map(palette), cmap="jet")
    else:
        sns.clustermap(frame, cbar=True, square=False, annot=False,
                       cmap='jet', standard_scale=1)
    win.show()
    return VizContainer.createResult(win, array_datas, ['heatmap'])
def evaluate(self, array_datas):
    """
    Draw the given arrays as a line plot.

    Parameters:
        array_datas  arrays to plot (converted by DataGuru)

    Returns:
        Error ResultObject if the conversion fails or nothing plottable is
        left, otherwise the figure result from ``VizContainer.createResult``.
    """
    sns.set(color_codes=True)
    status, frame, _keywords, title = DataGuru.transformArray_to_dataFrame(
        array_datas, useCategorical=True, expand_single=True,
        remove_nan=True)
    if status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)

    # Nothing to draw if no rows are left, or if the only column comes from
    # a non-numeric first array.
    nothing_to_plot = frame.shape[0] == 0
    if not nothing_to_plot and frame.shape[1] == 1:
        nothing_to_plot = not np.issubdtype(array_datas[0].data.dtype,
                                            np.number)
    if nothing_to_plot:
        Printer.Print("No data left to plot after cleaning up!")
        return ResultObject(None, None, None, CommandStatus.Error)

    win = Window.window()
    figure = win.gcf()
    axes = figure.add_subplot(111)
    axes.set_title(title)
    frame.plot(ax=axes)
    win.show()
    return VizContainer.createResult(win, array_datas, ['line'])
def evaluate(self, array_datas):
    """
    Draw a violin plot for several variables.

    When a ground truth is set and its length equals the number of rows,
    the frame is melted and the violins are split by ground truth (hue).
    Otherwise one violin per column is drawn.

    Parameters:
        array_datas  arrays to plot (converted by DataGuru with NaNs removed)

    Returns:
        Error ResultObject if the conversion fails, otherwise the figure
        result from ``VizContainer.createResult``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    status, frame, _keywords, _title = DataGuru.transformArray_to_dataFrame(
        array_datas, remove_nan=True)
    if status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)

    win = Window.window()
    figure = win.gcf()
    axes = figure.add_subplot(111)
    use_ground_truth = (
        StatContainer.ground_truth is not None and
        len(StatContainer.ground_truth.data) == frame.shape[0])
    if use_ground_truth:
        gt_column = " ".join(StatContainer.ground_truth.keyword_list)
        frame[gt_column] = StatContainer.filterGroundTruth()
        frame.dropna(inplace=True)
        long_frame = pd.melt(frame, id_vars=gt_column)
        sns.violinplot(data=long_frame, ax=axes, x='variable', y='value',
                       hue=gt_column)
    else:
        frame.dropna(inplace=True)
        sns.violinplot(data=frame, ax=axes)
    win.show()
    return VizContainer.createResult(win, array_datas, ['violin'])
def read(self, file_path, keyword_list):
    """
    Load algorithm properties from a file and build the matching model.

    Parameters:
        file_path     path handed to ``self.createProperties``
        keyword_list  keywords used to name the result

    Returns:
        Error ResultObject if the properties or the model cannot be created.
        Otherwise a cached ``DataType.algorithm_arg`` result whose data is
        ``[model, property_data, model_name, self.updateModel]``.
    """
    try:
        property_data, model_name = self.createProperties(file_path)
        model = DataGuru.createModel(property_data, model_name)
    except Exception:
        # Narrower than a bare except: KeyboardInterrupt/SystemExit propagate
        Printer.Print("File not found")
        return ResultObject(None, None, None, CommandStatus.Error)

    result_data = [model, property_data, model_name, self.updateModel]
    result_object = ResultObject(result_data, keyword_list,
                                 DataType.algorithm_arg,
                                 CommandStatus.Success,
                                 add_to_cache=True)
    result_object.createName(keyword_list)

    # A property editor is attached only when both GUI hooks are set
    if (PropertyEditor.parent_widget is None or
            PropertyEditor.property_editor_class is None):
        Printer.Print("Cannot modify algorithm properties in non-GUI mode")
    else:
        property_editor = PropertyEditor.property_editor_class(
            result_object)
        PropertyEditor.addPropertyEditor(property_editor)
    return result_object
def evaluate(self, data_frame, array_datas, classifier_algo,
             pre_evaluate_results=None):
    """
    Train a classifier on multiple arrays

    Displays cross-validation output that was computed beforehand and is
    passed in as ``pre_evaluate_results``.

    Parameters:
        data_frame            data frame result, or None
        array_datas           array results, used when data_frame is None
        classifier_algo       not used directly in this method
        pre_evaluate_results  two-element list ``[cv_output, aux_output]``

    Returns:
        Error ResultObject if the pre-evaluation results are missing or
        malformed, or if neither input is given. Otherwise the figure result,
        whose data is ``[win, properties, model_data, self.processkFoldCV]``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    if (not isinstance(pre_evaluate_results, list) or
            len(pre_evaluate_results) != 2):
        Printer.Print("Pre evaluation results failed! Attach bug report!")
        return result_object

    # Validate the inputs before opening a window, so that an invalid call
    # does not leave an empty window behind.
    if data_frame is not None:
        source = data_frame
    elif array_datas is not None:
        source = array_datas
    else:
        Printer.Print("Provide one of data frame or array datas")
        return result_object

    win = Window.window()
    result_object = VizContainer.createResult(win, source, ['cval'])

    cv_output, aux_output = pre_evaluate_results
    properties, model_data = aux_output.data
    result_object.data = [win, properties, model_data, self.processkFoldCV]
    self.printkValueMessage(cv_output.data[0])
    self.updateWindow(win, cv_output.data[1], cv_output.data[2],
                      model_data[1], properties["title"])
    self.modify_figure.evaluate(result_object)
    return result_object
def evaluate(self, data_frame, classifier_model):
    """
    Apply an already trained classifier to a data frame and report how
    well it predicts the ground truth.

    Parameters:
        data_frame        result whose ``data`` is the feature frame
        classifier_model  result whose ``data`` is a dict with the keys
                          'Model' and 'Scaler'

    Returns:
        Error ResultObject if no ground truth is set, otherwise an empty
        Success ResultObject (the prediction itself is not stored yet).
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    features = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing set ground truth to",
                      "to get the prediction accuracy")
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        return result_object

    features = DataGuru.removeGT(features, StatContainer.ground_truth)
    labels = StatContainer.ground_truth.data
    # Drop rows with NaNs from features and labels together
    features, labels = DataGuru.removenan(features, labels)

    # Unpack the trained model and the scaler stored with it
    trained = classifier_model.data
    model = trained['Model']
    scaler = trained['Scaler']
    # Scale the values with the scaler stored in the trained model
    X = scaler.transform(features.values)

    # Plot the classification result
    win = Window.window()
    figure = win.gcf()
    axes = figure.add_subplot(111)
    Printer.Print('Running the trained classifier...')
    predicted = model.predict(X)
    accuracy_text = "{0:.3%}".format(
        metrics.accuracy_score(predicted, labels))
    Printer.Print("Accuracy : %s" % accuracy_text)
    confusion = metrics.confusion_matrix(labels, predicted)
    DataGuru.plot_confusion_matrix(confusion, np.unique(labels), axes,
                                   title="confusion matrix")
    win.show()
    # TODO Need to save the result
    result_object = ResultObject(None, None, None, CommandStatus.Success)
    return result_object
def evaluate(self, data_frame, array_datas):
    """
    Run Isomap on a data frame or on a set of arrays.

    Parameters:
        data_frame   data frame result, preferred when not None
        array_datas  array results, used when data_frame is None

    Returns:
        Error ResultObject if no input is given or the arrays cannot be
        converted. Otherwise the figure result, whose data is
        ``[win, properties, [X, Y], self.updateFigure]``. X is the
        standardised feature matrix and Y the filtered ground truth
        (None when no ground truth is set).
    """
    # Collect the feature frame and a title from whichever input was given
    if data_frame is None and array_datas is None:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)
    if data_frame is not None:
        frame = DataGuru.convertStrCols_toNumeric(data_frame.data)
        title = data_frame.name
    else:
        status, frame, _keywords, title = (
            DataGuru.transformArray_to_dataFrame(array_datas,
                                                 useCategorical=True))
        if status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

    # Separate the ground truth (if any) and drop rows with NaNs
    Y = None
    if StatContainer.ground_truth is None:
        frame.dropna(inplace=True)
    else:
        frame = DataGuru.removeGT(frame, StatContainer.ground_truth)
        Y = StatContainer.filterGroundTruth()
        frame, Y = DataGuru.removenan(frame, Y)

    # Standardise the extracted values
    X = frame.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    win = Window.window()
    properties = self.createDefaultProperties()
    properties['title'] = title
    source = data_frame if data_frame is not None else array_datas
    result_object = VizContainer.createResult(win, source, ['ismp'])
    result_object.data = [win, properties, [X, Y], self.updateFigure]
    self.updateFigure(result_object.data)
    self.modify_figure.evaluate(result_object)
    return result_object
def createResult(self, out, keyword_list, create_name=True):
    """
    Wrap ``out`` in a successful logical-array result.

    Parameters:
        out           data stored in the result
        keyword_list  keywords for the result
        create_name   when True the name is generated via ``createName``
                      using ``self._condition[0]`` as the command name;
                      when False the keywords are assigned as they are

    Returns:
        the ResultObject
    """
    wrapped = ResultObject(out, [], DataType.logical_array,
                           CommandStatus.Success, True)
    if not create_name:
        wrapped.keyword_list = keyword_list
        return wrapped
    wrapped.createName(keyword_list,
                       set_keyword_list=True,
                       command_name=self._condition[0])
    return wrapped
def evaluate(self, input1):
    """
    Handle the input according to its data type.

    A string input is returned unchanged. An array input is returned with 1
    added. Any other type yields the error ResultObject.
    """
    keywords = ["overload", "result"]
    kind = input1.data_type
    if kind == DataType.string:
        print("I received a string")
        return ResultObject(input1.data, keywords, DataType.string)
    if kind == DataType.array:
        print("I received a number")
        return ResultObject(input1.data + 1, keywords, DataType.array)
    return ResultObject(None, None, None, CommandStatus.Error)
def evaluate(self, array_data):
    """
    Create a pie plot

    The array is optionally restricted by the conditional array (when its
    length matches), NaNs are dropped and the occurrences of each value are
    counted. When there are more than ``self.max_unique`` distinct values:
    string data is reduced to the ``self.max_unique`` most frequent values,
    numeric data is cut into 10 bins, and any other data is rejected.

    Parameters:
        array_data  result whose ``data`` is the array to plot and whose
                    ``keyword_list`` provides the title

    Returns:
        Error ResultObject if the values cannot be sorted/compared or there
        are too many distinct non-numeric, non-string values. Otherwise the
        figure result from ``VizContainer.createResult``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    stTitle = " ".join(array_data.keyword_list)

    cond = StatContainer.conditional_array
    if cond is not None and len(cond.data) == array_data.data.size:
        inds = cond.data
        Printer.Print("Nfiltered: ", np.sum(inds))
    else:
        inds = np.full(array_data.data.size, True)
    col_data = pd.Series(array_data.data[inds], name='array')
    col_data.dropna(inplace=True)

    try:
        uniqVals, inv, counts = np.unique(col_data, return_inverse=True,
                                          return_counts=True)
    except Exception:
        # e.g. values that cannot be compared with each other
        return ResultObject(None, None, None, CommandStatus.Error)

    can_plot = True
    if len(uniqVals) > self.max_unique:
        if isinstance(uniqVals[0], str):
            # Keep only the rows that belong to the most frequent values
            best_idx = np.argpartition(
                counts, -self.max_unique)[-self.max_unique:]
            col_data = col_data[np.isin(inv, best_idx)]
        elif np.issubdtype(col_data.dtype, np.number):
            # Convert to categorical
            col_data = pd.cut(col_data, 10)
        else:
            can_plot = False
    if not can_plot:
        Printer.Print("Too many unique values to plot on a pie chart\n")
        Printer.Print("Please select another chart type")
        return result_object

    # The ones must share col_data's index: after dropna/filtering that
    # index is no longer 0..n-1, and concat aligns rows by index label.
    ones = pd.Series(np.ones(col_data.size), index=col_data.index,
                     name='count')
    concat_df = pd.concat([ones, col_data], axis=1)
    ds = concat_df.groupby(col_data.name).sum()['count']

    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    ds.plot.pie(figsize=(8, 8), ax=ax)
    ax.set_title(stTitle)
    ax.set_xlabel('')
    ax.set_aspect('equal')
    win.show()
    return VizContainer.createResult(win, array_data, ['pie'])
def evaluate(self, array_datas):
    """
    Create a histogram for multiple variables

    The first column of the converted frame decides the plot type. If it
    has at most ``self.max_unique`` distinct values (or is string data, which
    is reduced to the ``self.max_unique`` most frequent values) a count plot
    of that column is drawn. Numeric data with more distinct values is drawn
    as a stacked histogram of the whole frame.

    Parameters:
        array_datas  arrays to plot (converted by DataGuru with NaNs removed)

    Returns:
        Error ResultObject if the conversion fails, no data is left, the
        values cannot be compared, or there are too many distinct
        non-numeric values. Otherwise the figure result from
        ``VizContainer.createResult``.
    """
    sns.set(color_codes=True)
    command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
        array_datas, useCategorical=True, remove_nan=True)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)
    dCol = df[df.columns[0]]
    if dCol.size == 0:
        # Without this guard uniqVals[0] below raises IndexError
        Printer.Print("No data left to plot after cleaning up!")
        return ResultObject(None, None, None, CommandStatus.Error)
    try:
        uniqVals, inv, counts = np.unique(dCol, return_inverse=True,
                                          return_counts=True)
    except Exception:
        # e.g. values that cannot be compared with each other
        return ResultObject(None, None, None, CommandStatus.Error)

    if len(uniqVals) > self.max_unique:
        if isinstance(uniqVals[0], str):
            # Keep only the rows that belong to the most frequent values
            best_idx = np.argpartition(
                counts, -self.max_unique)[-self.max_unique:]
            dCol = dCol[np.isin(inv, best_idx)]
        else:
            # Too many values for a count plot; fall back to a histogram
            uniqVals = None

    if uniqVals is not None and isinstance(uniqVals[0], str):
        max_len = max(len(uniqVal) for uniqVal in uniqVals)
    else:
        max_len = 0

    # The dtype test stays behind the None check so it is evaluated only
    # when a histogram is actually going to be drawn.
    if uniqVals is None and not np.issubdtype(dCol.dtype, np.number):
        Printer.Print("Too many unique values in non-numeric type data")
        return ResultObject(None, None, None, CommandStatus.Error)

    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    # TODO Create an argument for setting number of bins
    if uniqVals is not None:
        df = dCol.to_frame(name=kl1[0])
        if len(uniqVals) > 5 and max_len > 8:
            # Many long labels are easier to read on the y axis
            sns.countplot(y=kl1[0], data=df, ax=ax)
        else:
            sns.countplot(x=kl1[0], data=df, ax=ax)
    elif np.issubdtype(dCol.dtype, np.number):
        df.plot.hist(stacked=True, ax=ax)
    win.show()
    return VizContainer.createResult(win, array_datas,
                                     ['histogram', 'hist'])
def read(self, file_path, keyword_list):
    """
    Read a csv listing of files and create one file-name result per row.

    Parameters:
        file_path     location of the csv listing; resolved through
                      ``self.findFilePath``
        keyword_list  keywords used to describe the database (not used
                      in this method)

    Returns:
        Error ResultObject when the listing cannot be located, otherwise a
        list with one result per successfully handled row. Rows with an
        unknown file type, a folder that fails to load, or a file that
        cannot be located are skipped with a message.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    skipped_files = 0
    listing_path = self.findFilePath(file_path)
    if listing_path is None:
        return result_object

    listing = pd.read_csv(listing_path)
    self.checkHeaders(listing.columns.values)
    results = []
    for idx, row in listing.iterrows():
        try:
            file_type = DataType[row['file_type']]
        except KeyError:
            # Depending on verbosity
            Printer.Print("file type in line ", idx, " not understood in",
                          row['file_name'])
            Printer.Print("Skipping file ...")
            skipped_files += 1
            continue

        if file_type == DataType.folder:
            Printer.Print("Loading folder: ", row['file_name'])
            folder_result = ReadFolder().read(
                row['file_name'], row['keywords'].split(),
                'recursive' == row['description'])
            if folder_result.command_status != CommandStatus.Success:
                Printer.Print("Failed to load folder: ", row['file_name'])
            else:
                results.append(folder_result)
            continue

        resolved_path = self.findFilePath(row['file_name'])
        if resolved_path is None:
            Printer.Print("Cannot find file: ", row['file_name'])
            continue
        entry = FileObject(resolved_path, file_type, row['description'],
                           False)
        entry_keywords = row['keywords'].split(' ')
        entry_result = ResultObject(entry, entry_keywords,
                                    DataType.file_name)
        entry_result.createName(entry_keywords)
        results.append(entry_result)
    result_object = results
    return result_object
def evaluate(self, array_data, target):
    """
    Build a logical array with one entry per element of ``array_data.data``.

    Each entry is the value ``self.containsWordList`` returns for that
    element and the word list obtained by splitting ``target.data``.

    Returns:
        cached logical-array ResultObject, named with the command 'contains'
    """
    words = splitPattern(target.data)
    matches = []
    for entry in array_data.data:
        matches.append(self.containsWordList(entry, words))
    out = np.array(matches)
    result = ResultObject(out, [], DataType.logical_array,
                          CommandStatus.Success, True)
    result.createName(array_data.keyword_list, words,
                      set_keyword_list=True, command_name='contains')
    return result
def evaluate(self, array_data):
    """
    Calculate max value of the array and store it to history

    NaN/NaT entries are ignored. When a conditional array of matching size
    is set, only the entries it selects are considered.

    Parameters:
        array_data  result whose ``data`` is a numeric or datetime64 array

    Returns:
        Error ResultObject if the dtype is unsupported or no valid entries
        remain. Otherwise a list of results: the row label of the maximum
        (only when row labels are set) followed by the maximum value.
    """
    result_objects = []
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    array = array_data.data
    if numpy.issubdtype(array.dtype, numpy.number):
        idx = numpy.logical_not(numpy.isnan(array))
    elif numpy.issubdtype(array.dtype, numpy.datetime64):
        idx = numpy.logical_not(numpy.isnat(array))
    else:
        Printer.Print("The array is not supported type so cannot find max")
        return result_object
    if (StatContainer.conditional_array is not None and
            StatContainer.conditional_array.data.size == array.size):
        idx = numpy.logical_and(idx, StatContainer.conditional_array.data)
    if not numpy.any(idx):
        # numpy.max raises ValueError on an empty selection
        Printer.Print("No valid values left in the array so cannot find max")
        return result_object

    valid_values = array[idx]
    max_val = numpy.max(valid_values)
    # argmax is relative to the filtered values; translate it back to a
    # position in the full array so that it can index the row labels.
    max_idx = numpy.flatnonzero(idx)[numpy.argmax(valid_values)]

    if StatContainer.row_labels is not None:
        rl = StatContainer.row_labels.data
        max_rl = rl[max_idx]
        # Result for max index
        result_object = ResultObject(max_rl, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(StatContainer.row_labels.name,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        result_objects.append(result_object)
    # Result for max value
    result_object = ResultObject(max_val, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(array_data.keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)
    result_objects.append(result_object)

    # Create a dataframe to store the results
    df_new = pd.DataFrame()
    df_new['Feature'] = [array_data.name]
    df_new['Maximum'] = [max_val]
    if StatContainer.row_labels is not None:
        df_new[StatContainer.row_labels.name] = [max_rl]
    TablePrinter.printDataFrame(df_new)
    return result_objects
def evaluate(self, history, user_conv, name=None):
    """
    Save the notebook/chat, a table, or the last history element.

    If the user text contains 'notebook' or 'chat', ``Printer.save`` is
    called. If it contains 'table', ``save_table`` is called. Otherwise the
    last object in history is stored again under ``name``.

    Parameters:
        history    result whose ``data`` provides ``getLastObject`` and
                   ``last_data_type``
        user_conv  result whose ``data`` is the user text
        name       result whose ``data`` is the new name, or None

    Returns:
        Success ResultObject (empty for the notebook/chat/table cases,
        carrying the renamed data otherwise), or an Error ResultObject when
        saving the table fails, ``name`` is missing, or history is empty.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    wants_notebook = ('notebook' in user_conv.data or
                      'chat' in user_conv.data)
    if wants_notebook:
        Printer.save(name)
        return ResultObject(None, None, None, CommandStatus.Success)
    if 'table' in user_conv.data:
        if save_table(name, user_conv):
            return ResultObject(None, None, None, CommandStatus.Success)
        return result_object
    if name is None:
        return result_object

    try:
        last = history.data.getLastObject()
        new_keywords = name.data.lower().split(' ')
        result_object = ResultObject(last.data, new_keywords,
                                     history.data.last_data_type,
                                     CommandStatus.Success)
        result_object.createName(new_keywords)
        Printer.Print("Saving ", ' '.join(last.keyword_list), ' as ',
                      result_object.name)
    except RuntimeError:
        Printer.Print("Cannot find last object from history")
    return result_object
def evaluate(self, array_datas):
    """
    Combine several arrays into one categorical array plus a filter.

    For every input whose size equals that of the first input, the entries
    it selects are labelled with that input's name and marked True in the
    filter. Unselected entries stay 'Unknown' / False. Labels are stored as
    'U40', so names longer than 40 characters are truncated.
    NOTE(review): ``array_data.data`` is used as an index here, presumably a
    boolean mask - confirm against callers.

    Parameters:
        array_datas  a single array result or an iterable of them

    Returns:
        ``[result, result_filter]``: the categorical array and the logical
        filter, both named after the keywords common to all inputs (or the
        first input's keywords when there are none in common).
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc. Imported locally so the file-level imports stay as
    # they are.
    from collections.abc import Iterable

    if not isinstance(array_datas, Iterable):
        array_datas = [array_datas]
    N = array_datas[0].data.size
    out = np.full(N, 'Unknown', dtype='U40')
    out_filter = np.full(N, False)
    Printer.Print("Creating a categorical array from: ")
    for array_data in array_datas:
        Printer.Print(array_data.name)
        if array_data.data.size == N:
            out[array_data.data] = array_data.name
            out_filter[array_data.data] = True

    kl1 = [" ".join(array_data.keyword_list) for array_data in array_datas]
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    if common_name == '':
        # Fall back to the first input's keywords. This used to index the
        # loop variable (array_data[0]) instead of the list.
        common_name_list = array_datas[0].keyword_list
    else:
        common_name_list = common_name.split(' ')

    result = ResultObject(out, [], DataType.array, CommandStatus.Success)
    result.createName(common_name_list, command_name='categorical',
                      set_keyword_list=True)
    result_filter = ResultObject(out_filter, [], DataType.logical_array,
                                 CommandStatus.Success, True)
    result_filter.createName(common_name_list, command_name='filter',
                             set_keyword_list=True)
    Printer.Print('Saving categorical array as', result.name)
    Printer.Print('Saving filter as', result_filter.name)
    return [result, result_filter]
def evaluate(self, data_frame, classifier_algo):
    """
    Fit the given classifier on a data frame against the ground truth.

    Parameters:
        data_frame       result whose ``data`` is the feature frame
        classifier_algo  result whose ``data[0]`` is the model to fit

    Returns:
        Error ResultObject if no ground truth is set. Otherwise a
        ``DataType.trained_model`` result whose data is the dict
        ``{'Scaler': scaler, 'Model': model}``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    features = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing set ground truth before using this command")
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        return result_object

    features = DataGuru.removeGT(features, StatContainer.ground_truth)
    labels = StatContainer.filterGroundTruth()
    # Drop rows with NaNs from features and labels together
    features, labels = DataGuru.removenan(features, labels)

    model = classifier_algo.data[0]
    # Standardise the feature values before fitting
    X = features.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    Printer.Print("Training the classifier")
    feature_table = pd.DataFrame()
    feature_table['Features'] = features.columns
    TablePrinter.printDataFrame(feature_table)
    model.fit(X, labels)
    Printer.Print("The classifier", classifier_algo.name,
                  "has been trained")

    # Report the accuracy on the data the model was just fitted on
    predicted = model.predict(X)
    accuracy_text = "{0:.3%}".format(
        metrics.accuracy_score(predicted, labels))
    Printer.Print("Accuracy on training set : %s" % accuracy_text)

    result_object = ResultObject({'Scaler': scaler, 'Model': model}, [],
                                 DataType.trained_model,
                                 CommandStatus.Success)
    algo_label = classifier_algo.name.replace('.', ' ')
    result_object.createName(data_frame.keyword_list,
                             command_name=algo_label,
                             set_keyword_list=True)
    return result_object
def evaluate(self, array_datas):
    """
    Show a heatmap of how often the values of the first two categorical
    arrays occur together.

    Parameters:
        array_datas  array results; any numeric array aborts the command

    Returns:
        Error ResultObject if one of the arrays is numeric, otherwise the
        figure result from ``VizContainer.createResult``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    table = pd.DataFrame()
    for item in array_datas:
        if np.issubdtype(item.data.dtype, np.number):
            Printer.Print("The data to plot is not categorical, Please use scatter plot")
            return result_object
        column_name = " ".join(item.keyword_list)
        table[column_name] = item.data
    table.dropna(inplace=True)

    # Cross-tabulate the first column against the second
    row_key = table.columns[0]
    col_key = table.columns[1]
    table = table.pivot_table(index=row_key, columns=col_key,
                              aggfunc=np.size, fill_value=0)

    Printer.Print("Displaying heatmap")
    win = Window.window()
    figure = win.gcf()
    axes = figure.add_subplot(111)
    sns.heatmap(table, ax=axes)
    win.show()
    return VizContainer.createResult(win, array_datas, ['heatmap'])
def createResult(self, window, array_datas, in_keywords):
    """
    Build the cached figure result for a freshly drawn window.

    The keywords are 'figure', the figure number, ``in_keywords`` and the
    keywords shared by all inputs. The window's figure is also remembered
    as ``current_figure``.

    Parameters:
        window       window whose ``gcf()`` returns the figure
        array_datas  a single result or an iterable of results, each with a
                     ``keyword_list``
        in_keywords  extra keywords describing the plot type

    Returns:
        ResultObject of type ``DataType.figure`` holding the window
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc. Imported locally so the file-level imports stay as
    # they are.
    from collections.abc import Iterable

    figure = window.gcf()
    fig_keywords = ['figure', str(figure.number)] + list(in_keywords)
    if not isinstance(array_datas, Iterable):
        array_datas = [array_datas]
    # TODO Later try adding some room for error like its there in 70% of the arrays
    keyword_sets = [set(array_data.keyword_list)
                    for array_data in array_datas]
    # set.intersection() with no arguments raises TypeError, so an empty
    # input contributes no common keywords.
    common_kl = set.intersection(*keyword_sets) if keyword_sets else set()
    fig_keywords = fig_keywords + list(common_kl)
    result_object = ResultObject(window, fig_keywords, DataType.figure,
                                 CommandStatus.Success, add_to_cache=True)
    result_object.createName(fig_keywords)
    self.current_figure = figure
    return result_object
def evaluate(self, array_datas):
    """
    Draw a scatter matrix for two or more variables.

    When a ground truth is set and its length equals the number of rows,
    the points are coloured by class: each distinct ground-truth value is
    mapped to an evenly spaced number in [0, 1] on the "jet" colour map.

    Parameters:
        array_datas  array results to plot

    Returns:
        Error ResultObject if the conversion fails or fewer than two
        columns are available, otherwise the figure result from
        ``VizContainer.createResult``.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    sns.set(color_codes=True)
    status, frame, _keywords, title = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if status == CommandStatus.Error:
        return result_object
    if len(frame.columns) <= 1:
        Printer.Print("There needs to be atleast two variables to perform multiscatter plot!")
        return result_object

    win = Window.window()
    figure = win.gcf()
    axes = figure.add_subplot(111)
    colour_by_class = (
        StatContainer.ground_truth is not None and
        len(StatContainer.ground_truth.data) == frame.shape[0])
    if colour_by_class:
        classes = pd.Series(StatContainer.filterGroundTruth())
        frame, classes = DataGuru.removenan(frame, classes)
        positions = np.linspace(0, 1, classes.unique().size)
        colour_of = dict(zip(classes.unique(), positions))
        pd.plotting.scatter_matrix(frame, alpha=0.2, diagonal='kde',
                                   c=classes.map(colour_of), cmap="jet",
                                   ax=axes)
    else:
        frame.dropna(inplace=True)
        pd.plotting.scatter_matrix(frame, alpha=0.2, diagonal='kde',
                                   ax=axes)
    figure.suptitle(title)
    win.show()
    return VizContainer.createResult(win, array_datas, ['multiscatter'])
def read(self, file_path, keyword_list):
    """
    Load an image file, display it and return it as a cached result.

    Parameters:
        file_path     path of the image, read with ``imread``
        keyword_list  keywords used to name the result

    Returns:
        Error ResultObject if the image cannot be read, otherwise a cached
        ``DataType.image`` result holding the image data.
    """
    try:
        data = imread(file_path)
    except Exception:
        # Narrower than a bare except: KeyboardInterrupt/SystemExit propagate
        return ResultObject(None, None, None,
                            command_status=CommandStatus.Error)
    win = Window.window()
    plt.imshow(data)
    plt.gca().axis('off')
    win.show()
    # Initialize image manipulation command group
    result = ResultObject(data, keyword_list, DataType.image,
                          CommandStatus.Success, add_to_cache=True)
    result.createName(keyword_list)
    return result
def evaluate(self, data_frame, target):
    """
    Rank the features of a data frame by random-forest importance.

    A RandomForestClassifier with 100 estimators is fitted on the
    standardised features against the ground truth. The number of features
    to keep is the first positive number found in ``target.data`` and
    defaults to 10.

    Parameters:
        data_frame  result whose ``data`` is the feature frame
        target      result whose ``data`` is the user text searched for a
                    number

    Returns:
        Error ResultObject if no ground truth is set, otherwise a
        ``DataType.csv`` result holding only the top feature columns.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    features = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing set ground truth before using this command")
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        return result_object

    features = DataGuru.removeGT(features, StatContainer.ground_truth)
    labels = StatContainer.filterGroundTruth()
    # Drop rows with NaNs from features and labels together
    features, labels = DataGuru.removenan(features, labels)

    # If not specified select top 10 features
    num = 10
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        num = int(numbers[0].data)

    # Standardise the feature values before fitting
    X = features.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X, labels)

    ranking = pd.Series(forest.feature_importances_,
                        index=features.columns)
    ranking = ranking.sort_values(ascending=False)
    top_names = ranking.index[0:num]
    summary = pd.DataFrame()
    summary['top features'] = top_names
    summary['feature importance'] = ranking.values[0:num]
    TablePrinter.printDataFrame(summary)

    result_object = ResultObject(features[top_names], [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(data_frame.name,
                             command_name='top.predictors',
                             set_keyword_list=True)
    return result_object
def evaluate(self, dummy):
    """
    Return a cached string result holding ``dummy.data``, or
    "default string" when ``dummy`` is None.
    """
    text = "default string" if dummy is None else dummy.data
    return ResultObject(text, ["dummy", "result"], DataType.string,
                        add_to_cache=True)
def evaluate(self):
    """
    Remove the conditional array (filter) held by StatContainer.

    Returns:
        empty ResultObject with Success status
    """
    StatContainer.conditional_array = None
    Printer.Print("clearing conditional array")
    done = ResultObject(None, None, None, CommandStatus.Success)
    return done
def evaluate(self, array_data):
    """
    Store ``array_data`` as the conditional array (filter) on StatContainer.

    Parameters:
        array_data  result to use as the filter; its ``name`` is printed

    Returns:
        empty ResultObject with Success status
    """
    StatContainer.conditional_array = array_data
    Printer.Print("Setting filter to ", array_data.name)
    done = ResultObject(None, None, None, CommandStatus.Success)
    return done
def evaluate(self, file_name, pre_evaluate_results=None):
    """
    Load a file with the reader registered for its type, or bring an
    existing figure back to the front.

    Parameters:
        file_name             figure result, or file-name result whose
                              ``data`` holds ``path``, ``data_type`` and
                              ``loaded``
        pre_evaluate_results  forwarded to readers that have
                              ``read_in_background`` set

    Returns:
        Success ResultObject for a figure or an already loaded file;
        whatever the reader returns for a readable file; Error ResultObject
        if the file is missing or no reader exists for its type.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)

    # Figures are only shown again, never re-read
    if file_name.data_type is DataType.figure:
        Printer.Print("Loading figure ", ' '.join(file_name.keyword_list))
        stored = file_name.data
        win = stored[0] if type(stored) is list else stored
        VizContainer.current_figure = win.gcf()
        win.show()
        return ResultObject(None, None, None, CommandStatus.Success)

    # Algorithm arguments are re-read even when flagged as loaded
    already_loaded = (file_name.data.loaded and
                      file_name.data.data_type is not DataType.algorithm_arg)
    if already_loaded:
        Printer.Print("File already loaded!")
        return ResultObject(None, None, None, CommandStatus.Success)

    if not os.path.isfile(file_name.data.path):
        Printer.Print("File not found.\n Please make sure the file exists "
                      "in the specified location")
        return result_object

    data_type = file_name.data.data_type
    if data_type not in self.reader_dictionary:
        Printer.Print("We cannot load ", data_type,
                      " yet! Please try again later")
        return result_object

    reader = self.reader_dictionary[data_type]
    if reader.read_in_background:
        result_object = reader.read(file_name.data.path,
                                    file_name.keyword_list,
                                    pre_evaluate_results)
    else:
        result_object = reader.read(file_name.data.path,
                                    file_name.keyword_list)
    Printer.Print("Loaded file: ", os.path.basename(file_name.data.path))
    file_name.data.loaded = True
    return result_object