def evaluate(self, array_data, user_conv):
    """
    Extract date/time components (day, year, month, hour, minute) from an
    array.

    Parameters:
        array_data array whose ``data`` holds either strings (parsed with
            ``pd.to_datetime``; the parsed values are written back to
            ``array_data.data``) or datetime objects
        user_conv object whose ``data`` is searched for the component
            names, either singular or with a trailing 's'

    Returns a list with one ResultObject per requested component, a
    data-less success ResultObject when no component was requested, or an
    error ResultObject when the data cannot be treated as date time.
    """
    # ``pd.datetime`` was only an alias of ``datetime.datetime`` and is no
    # longer available in current pandas releases; use the stdlib class.
    import datetime

    try:
        if isinstance(array_data.data[0], str):
            date_time = pd.to_datetime(array_data.data,
                                       infer_datetime_format=True)
            array_data.data = date_time
        else:
            date_time = array_data.data
            if not isinstance(array_data.data[0], datetime.datetime):
                raise RuntimeError()
    except Exception:
        # Any parsing/indexing failure means the data is unusable here, but
        # KeyboardInterrupt/SystemExit must still propagate.
        Printer.Print("Cannot transform data to date time")
        return ResultObject(None, None, None, CommandStatus.Error)

    results = []
    for word in ['day', 'year', 'month', 'hour', 'minute']:
        if word in user_conv.data or word + 's' in user_conv.data:
            out = getattr(date_time, word)
            result = ResultObject(out, [], DataType.array)
            result.createName(array_data.keyword_list, command_name=word,
                              set_keyword_list=True)
            results.append(result)
            Printer.Print('Saving ', word, 'from ', array_data.name, ' as',
                          result.name)
    if results:
        return results
    return ResultObject(None, None, None, CommandStatus.Success)
def evaluate(self, history, user_conv, name=None):
    """
    Save the last element from history under the given name.

    When the conversation mentions 'notebook' or 'chat' the printer output
    is saved instead; when it mentions 'table' the request is forwarded to
    ``save_table``. Returns a ResultObject whose status tells whether the
    save succeeded.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)

    if 'notebook' in user_conv.data or 'chat' in user_conv.data:
        Printer.save(name)
        return ResultObject(None, None, None, CommandStatus.Success)

    if 'table' in user_conv.data:
        table_saved = save_table(name, user_conv)
        if table_saved:
            return ResultObject(None, None, None, CommandStatus.Success)
        return failure

    if name is None:
        return failure

    saved = failure
    try:
        last_object = history.data.getLastObject()
        keywords = name.data.lower().split(' ')
        saved = ResultObject(last_object.data, keywords,
                             history.data.last_data_type,
                             CommandStatus.Success)
        saved.createName(keywords)
        Printer.Print("Saving ", ' '.join(last_object.keyword_list), ' as ',
                      saved.name)
    except RuntimeError:
        Printer.Print("Cannot find last object from history")
    return saved
def evaluate(self, csv_data):
    """
    Transform a csv to its standardized counterpart.

    Works on a copy of ``csv_data.data``. Every column that is not reported
    as categorical by ``StatContainer.isCategorical``, has at least one
    non-null value and whose first non-null value is not a string is
    replaced by (column - mean) / std, with mean and std taken over the
    non-null values.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    scaled = csv_data.data.copy()
    for col_name in scaled.columns:
        non_null = scaled[col_name].dropna()
        categories = StatContainer.isCategorical(non_null)
        if categories is not None:
            continue
        if not len(non_null) > 0:
            continue
        if isinstance(non_null.iloc[0], str):
            continue
        centered = scaled[col_name] - numpy.mean(non_null)
        scaled[col_name] = centered / numpy.std(non_null)
    Printer.Print("Saving the scaled data...")
    result_object = ResultObject(scaled, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(csv_data.keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)
    return result_object
def createResult(self, out, keyword_list):
    """
    Wrap ``out`` in a successful logical-array ResultObject named after
    ``keyword_list`` with the 'between' command name.
    """
    logical_result = ResultObject(out, [], DataType.logical_array,
                                  CommandStatus.Success, True)
    logical_result.createName(keyword_list,
                              command_name='between',
                              set_keyword_list=True)
    return logical_result
def evaluate(self, array_data):
    """
    Calculate sum of all elements of the array and store it to history

    NaN entries are ignored, and so are entries excluded by
    ``StatContainer.conditional_array`` when that filter has the same size
    as the array. Non-numeric arrays yield an error ResultObject.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)
    values = array_data.data
    if not numpy.issubdtype(values.dtype, numpy.number):
        Printer.Print("The array is not of numeric type so cannot",
                      "take sum")
        return failure

    keep = numpy.logical_not(numpy.isnan(values))
    if (StatContainer.conditional_array is not None and
            StatContainer.conditional_array.data.size == values.size):
        keep = numpy.logical_and(keep, StatContainer.conditional_array.data)

    total = numpy.sum(values[keep])
    summed = ResultObject(total, [], DataType.array, CommandStatus.Success)
    summed.createName(array_data.keyword_list,
                      command_name=self.commandTags()[0],
                      set_keyword_list=True)
    Printer.Print("Sum of", array_data.name, "is", total)
    return summed
def evaluate(self, array_data):
    """
    Combine logical arrays with the operator configured on this command.

    Parameters:
        array_data sequence of array results; '!' negates the first one,
            while '&', '||' and '^' fold the remaining arrays into the
            first one from left to right

    Returns a logical-array ResultObject, or an error ResultObject when no
    array is given or the operator is not one of the above while more than
    one array was supplied.
    """
    if len(array_data) < 1:
        return ResultObject(None, None, None, CommandStatus.Error)

    first = array_data[0]
    Printer.Print("Performing logical", self._add_tags[0], "on ")
    Printer.Print(first.name)
    if self._operator == '!':
        out = np.logical_not(first.data)
    else:
        # Work on a copy: the filter below assigns into ``out`` and must not
        # write into the caller's array when no further array is combined.
        out = np.array(first.data, copy=True)

    for arr_data in array_data[1:]:
        Printer.Print(", ", arr_data.name)
        if self._operator == '&':
            out = np.logical_and(out, arr_data.data)
        elif self._operator == '||':
            out = np.logical_or(out, arr_data.data)
        elif self._operator == '^':
            out = np.logical_xor(out, arr_data.data)
        else:
            return ResultObject(None, None, None, CommandStatus.Error)
        Printer.Print(arr_data.name)

    # Apply the active filter. The mask lives in ``.data`` (as the other
    # commands read it); negating the wrapper object itself selects nothing.
    condition = StatContainer.conditional_array
    if condition is not None and condition.data.size == out.size:
        out[np.logical_not(condition.data)] = False

    result = ResultObject(out, [], DataType.logical_array,
                          CommandStatus.Success, True)
    if len(array_data) > 1:
        keyword_list2 = array_data[1].keyword_list
    else:
        keyword_list2 = []
    result.createName(first.keyword_list, keyword_list2,
                      command_name=self._add_tags[0],
                      set_keyword_list=True)
    return result
def read(self, file_path, keyword_list):
    """
    Load an algorithm description and build the corresponding model.

    Parameters:
        file_path location handed to ``self.createProperties``
        keyword_list keywords used to name the resulting object

    Returns a ResultObject of type ``algorithm_arg`` whose data is
    ``[model, property_data, model_name, self.updateModel]``, or an error
    ResultObject when the properties or the model cannot be created.
    """
    try:
        property_data, model_name = self.createProperties(file_path)
        model = DataGuru.createModel(property_data, model_name)
    except Exception:
        # Keep the best-effort behaviour for any load failure, but let
        # KeyboardInterrupt/SystemExit through.
        Printer.Print("File not found")
        return ResultObject(None, None, None, CommandStatus.Error)

    command_status = CommandStatus.Success
    result_data = [model, property_data, model_name, self.updateModel]
    result_object = ResultObject(result_data, keyword_list,
                                 DataType.algorithm_arg, command_status,
                                 add_to_cache=True)
    result_object.createName(keyword_list)

    if (PropertyEditor.parent_widget is None or
            PropertyEditor.property_editor_class is None):
        Printer.Print("Cannot modify algorithm properties in non-GUI mode")
    else:
        property_editor = PropertyEditor.property_editor_class(result_object)
        PropertyEditor.addPropertyEditor(property_editor)
    return result_object
def evaluate(self, array_data):
    """
    Calculate stdev value of the array and store it to history

    NaN entries are ignored, and so are entries excluded by
    ``StatContainer.conditional_array`` when that filter has the same size
    as the array. The value is also shown as a one-row table. Non-numeric
    arrays yield an error ResultObject.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)
    values = array_data.data
    if not numpy.issubdtype(values.dtype, numpy.number):
        Printer.Print("The array is not of numeric type so cannot",
                      "find stdev")
        return failure

    keep = numpy.logical_not(numpy.isnan(values))
    if (StatContainer.conditional_array is not None and
            StatContainer.conditional_array.data.size == values.size):
        keep = numpy.logical_and(keep, StatContainer.conditional_array.data)

    spread = numpy.std(values[keep])
    stdev_result = ResultObject(spread, [], DataType.array,
                                CommandStatus.Success)
    stdev_result.createName(array_data.keyword_list,
                            command_name=self.commandTags()[0],
                            set_keyword_list=True)

    summary = pd.DataFrame()
    summary['Feature'] = [array_data.name]
    summary['Standard Deviation'] = [spread]
    TablePrinter.printDataFrame(summary)
    return stdev_result
def evaluate(self, array_datas):
    """
    Build a categorical array (and a matching filter) from logical arrays.

    Parameters:
        array_datas one logical array result or an iterable of them; every
            array with the same size as the first one labels the entries
            it selects with its own name

    Returns ``[categorical, filter]``: entries selected by no array keep
    the label 'Unknown' and are False in the filter.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.
    from collections.abc import Iterable

    if not isinstance(array_datas, Iterable):
        array_datas = [array_datas]
    N = array_datas[0].data.size
    out = np.full(N, 'Unknown', dtype='U40')
    out_filter = np.full(N, False)
    Printer.Print("Creating a categorical array from: ")
    for array_data in array_datas:
        Printer.Print(array_data.name)
        if array_data.data.size == N:
            out[array_data.data] = array_data.name
            out_filter[array_data.data] = True

    kl1 = [" ".join(array_data.keyword_list) for array_data in array_datas]
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    if common_name == '':
        # Fall back to the first input's keywords. The original indexed the
        # loop variable (``array_data[0]``) instead of the input list.
        common_name_list = array_datas[0].keyword_list
    else:
        common_name_list = common_name.split(' ')

    result = ResultObject(out, [], DataType.array, CommandStatus.Success)
    result.createName(common_name_list, command_name='categorical',
                      set_keyword_list=True)
    result_filter = ResultObject(out_filter, [], DataType.logical_array,
                                 CommandStatus.Success, True)
    result_filter.createName(common_name_list, command_name='filter',
                             set_keyword_list=True)
    Printer.Print('Saving categorical array as', result.name)
    Printer.Print('Saving filter as', result_filter.name)
    return [result, result_filter]
def evaluate(self, array_data):
    """
    Calculate count (number of values) of an array and store it to history

    The active ``StatContainer.conditional_array`` filter is applied when
    its size matches the array, NaN entries are dropped, and the remaining
    non-zero entries are counted. Non-numeric arrays yield an error
    ResultObject.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    array = array_data.data
    condition = StatContainer.conditional_array
    # Only apply the filter when it fits this array, as the sum/stdev/max
    # commands do; a filter of a different size would raise IndexError.
    if condition is not None and condition.data.size == array.size:
        array = array[condition.data]
    nan_idx = StatContainer.getNanIdx(array)
    if numpy.issubdtype(array.dtype, numpy.number):
        array_filtered = array[numpy.logical_not(nan_idx)]
        # NOTE(review): count_nonzero skips zeros, so this is the number of
        # non-zero values rather than the number of values - confirm intent.
        count_val = numpy.count_nonzero(array_filtered)
        result_object = ResultObject(count_val, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        Printer.Print("Count of", array_data.name, "is", count_val)
    else:
        Printer.Print("The array is not of numeric type so cannot",
                      "find count")
    return result_object
def evaluate(self, data_frame, classifier_algo):
    """
    Train a classifier on multiple arrays

    The ground truth set on ``StatContainer`` is removed from the features
    and used as the target. Features are standard-scaled before fitting
    ``classifier_algo.data[0]``. Returns a ``trained_model`` ResultObject
    holding ``{'Scaler': ..., 'Model': ...}``, or an error ResultObject when
    no ground truth is set.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    features = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing set ground truth before using this command")
        return ResultObject(None, None, None, CommandStatus.Error)

    features = DataGuru.removeGT(features, StatContainer.ground_truth)
    labels = StatContainer.filterGroundTruth()
    # Drop rows containing NaNs from features and labels together
    features, labels = DataGuru.removenan(features, labels)

    classifier = classifier_algo.data[0]

    # Standardize the feature matrix before fitting
    raw_matrix = features.values
    scaler = preprocessing.StandardScaler().fit(raw_matrix)
    scaled_matrix = scaler.transform(raw_matrix)

    Printer.Print("Training the classifier")
    feature_table = pd.DataFrame()
    feature_table['Features'] = features.columns
    TablePrinter.printDataFrame(feature_table)
    classifier.fit(scaled_matrix, labels)
    Printer.Print("The classifier", classifier_algo.name, "has been trained")

    predicted = classifier.predict(scaled_matrix)
    accuracy = metrics.accuracy_score(predicted, labels)
    Printer.Print("Accuracy on training set : %s" %
                  "{0:.3%}".format(accuracy))

    result_object = ResultObject({'Scaler': scaler, 'Model': classifier}, [],
                                 DataType.trained_model,
                                 CommandStatus.Success)
    algo_label = classifier_algo.name.replace('.', ' ')
    result_object.createName(data_frame.keyword_list,
                             command_name=algo_label,
                             set_keyword_list=True)
    return result_object
def evaluate(self, data_frame, target):
    """
    Use one of the models to identify the top predictors

    A random forest with 100 estimators is fitted on the standard-scaled
    features against the ground truth set on ``StatContainer``. The number
    of features to keep is the first positive number found in
    ``target.data`` (10 when none is given). Returns a csv ResultObject
    with the selected columns, or an error ResultObject when no ground
    truth is set.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    features = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing set ground truth before using this command")
        return ResultObject(None, None, None, CommandStatus.Error)

    features = DataGuru.removeGT(features, StatContainer.ground_truth)
    labels = StatContainer.filterGroundTruth()
    # Drop rows containing NaNs from features and labels together
    features, labels = DataGuru.removenan(features, labels)

    numbers = findNumbers(target.data, 1)
    top_n = 10  # If not specified select top 10 features
    if numbers != [] and numbers[0].data > 0:
        top_n = int(numbers[0].data)

    # Standardize the feature matrix before fitting
    raw_matrix = features.values
    scaler = preprocessing.StandardScaler().fit(raw_matrix)
    scaled_matrix = scaler.transform(raw_matrix)

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(scaled_matrix, labels)
    ranked = pd.Series(forest.feature_importances_, index=features.columns)
    ranked = ranked.sort_values(ascending=False)

    ranking_table = pd.DataFrame()
    ranking_table['top features'] = ranked.index[0:top_n]
    ranking_table['feature importance'] = ranked.values[0:top_n]
    TablePrinter.printDataFrame(ranking_table)

    selected = features[ranked.index[0:top_n]]
    result_object = ResultObject(selected, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(data_frame.name,
                             command_name='top.predictors',
                             set_keyword_list=True)
    return result_object
def createResult(self, out, keyword_list, create_name=True):
    """
    Wrap ``out`` in a successful logical-array ResultObject.

    With ``create_name`` the name is generated from ``keyword_list`` and
    the first entry of ``self._condition``; otherwise only the keyword
    list is attached.
    """
    logical_result = ResultObject(out, [], DataType.logical_array,
                                  CommandStatus.Success, True)
    if not create_name:
        logical_result.keyword_list = keyword_list
        return logical_result
    logical_result.createName(keyword_list,
                              command_name=self._condition[0],
                              set_keyword_list=True)
    return logical_result
def read(self, file_path, keyword_list):
    """
    Load the file name specified and store it in history
    Parameters:
        file_path file location which is expected to be of type csv
        keyword_list keywords used to describe the database

    Each row of the csv describes one file or folder. Returns the list of
    results created from the rows, or an error ResultObject when the
    database file itself cannot be located.
    """
    failure = ResultObject(None, None, None, CommandStatus.Error)
    skipped_files = 0
    database_path = self.findFilePath(file_path)
    if database_path is None:
        return failure

    table = pd.read_csv(database_path)
    self.checkHeaders(table.columns.values)
    loaded = []
    for line_no, entry in table.iterrows():
        try:
            file_type = DataType[entry['file_type']]
        except KeyError:
            # Depending on verbosity
            Printer.Print("file type in line ", line_no,
                          " not understood in", entry['file_name'])
            Printer.Print("Skipping file ...")
            skipped_files += 1
            continue

        if file_type == DataType.folder:
            Printer.Print("Loading folder: ", entry['file_name'])
            folder_reader = ReadFolder()
            folder_result = folder_reader.read(
                entry['file_name'], entry['keywords'].split(),
                'recursive' == entry['description'])
            if folder_result.command_status != CommandStatus.Success:
                Printer.Print("Failed to load folder: ", entry['file_name'])
            else:
                loaded.append(folder_result)
            continue

        resolved_path = self.findFilePath(entry['file_name'])
        if resolved_path is None:
            Printer.Print("Cannot find file: ", entry['file_name'])
            continue

        file_keywords = entry['keywords'].split(' ')
        file_result = ResultObject(
            FileObject(resolved_path, file_type, entry['description'], False),
            file_keywords, DataType.file_name)
        file_result.createName(file_keywords)
        loaded.append(file_result)
    return loaded
def evaluate(self, array_data, target):
    """
    Flag every element of the array for which ``self.containsWordList``
    accepts the words split from ``target.data``; the flags are returned
    as a logical-array ResultObject.
    """
    words = splitPattern(target.data)
    flags = []
    for element in array_data.data:
        flags.append(self.containsWordList(element, words))
    matches = np.array(flags)

    contains_result = ResultObject(matches, [], DataType.logical_array,
                                   CommandStatus.Success, True)
    contains_result.createName(array_data.keyword_list, words,
                               command_name='contains',
                               set_keyword_list=True)
    return contains_result
def evaluate(self, array_data):
    """
    Calculate max value of the array and store it to history

    NaN/NaT entries are ignored, and so are entries excluded by
    ``StatContainer.conditional_array`` when that filter has the same size
    as the array.

    Returns a list of ResultObjects: the row label of the maximum (only
    when ``StatContainer.row_labels`` is set) followed by the maximum
    value. Unsupported dtypes return a single error ResultObject.
    """
    result_objects = []
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    array = array_data.data
    if numpy.issubdtype(array.dtype, numpy.number):
        valid = numpy.logical_not(numpy.isnan(array))
    elif numpy.issubdtype(array.dtype, numpy.datetime64):
        valid = numpy.logical_not(numpy.isnat(array))
    else:
        Printer.Print("The array is not supported type so cannot find max")
        return result_object

    if (StatContainer.conditional_array is not None and
            StatContainer.conditional_array.data.size == array.size):
        valid = numpy.logical_and(valid,
                                  StatContainer.conditional_array.data)

    selected = array[valid]
    max_val = numpy.max(selected)
    # argmax is relative to the filtered subset; translate it back to a
    # position in the full array before looking up the (unfiltered) row
    # label, otherwise any dropped element shifts the label.
    # NOTE(review): assumes row_labels.data is aligned with the array.
    max_pos = numpy.flatnonzero(valid)[numpy.argmax(selected)]

    if StatContainer.row_labels is not None:
        rl = StatContainer.row_labels.data
        max_rl = rl[max_pos]
        # Result for max index
        result_object = ResultObject(max_rl, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(StatContainer.row_labels.name,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        result_objects.append(result_object)

    # Result for max value
    result_object = ResultObject(max_val, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(array_data.keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)
    result_objects.append(result_object)

    # Create a dataframe to show the results
    df_new = pd.DataFrame()
    df_new['Feature'] = [array_data.name]
    df_new['Maximum'] = [max_val]
    if StatContainer.row_labels is not None:
        df_new[StatContainer.row_labels.name] = [max_rl]
    TablePrinter.printDataFrame(df_new)
    return result_objects
def evaluate(self, array_datas, data_frame):
    """
    Calculate a summary table with ``self.performOperation`` and store it
    to history

    Parameters:
        array_datas arrays to analyze, used only when no data frame is given
        data_frame data frame to analyze, takes precedence over the arrays

    Returns a list of ResultObjects: the summary table followed by one
    array per summary column, or a single error ResultObject when no
    usable input is available.
    """
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        # Check the status before touching the other outputs, which are
        # not guaranteed to be usable after a failure.
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        if len(cname) == 0:
            cname = ".".join(kl1)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)

    df_new = self.performOperation(df)
    TablePrinter.printDataFrame(df_new)

    result_objects = []
    command_name = "smry"
    # Adding the newly created CSV
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name=command_name,
                             set_keyword_list=True)
    result_objects.append(result_object)

    # create an updated list of column names by removing the common names
    kl1 = df_new.columns
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    for col in range(len(kl1)):
        result_object = ResultObject(df_new[kl1[col]], [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(truncated_kl1[col],
                                 command_name=command_name,
                                 set_keyword_list=True)
        result_objects.append(result_object)
    return result_objects
def createResult(self, window, array_datas, in_keywords):
    """
    Wrap a plotting window in a figure ResultObject.

    Parameters:
        window object providing ``gcf()``; stored as the result data
        array_datas one result or an iterable of results that were plotted
        in_keywords extra keywords added after 'figure' and the figure
            number

    The keywords shared by all plotted results are appended as well, and
    the figure is remembered in ``self.current_figure``.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.
    from collections.abc import Iterable

    figure = window.gcf()
    fig_keywords = ['figure', str(figure.number)]
    fig_keywords = fig_keywords + in_keywords
    if not isinstance(array_datas, Iterable):
        array_datas = [array_datas]
    # TODO Later try adding some room for error like its there in 70% of
    # the arrays
    common_kl = set.intersection(
        *[set(array_data.keyword_list) for array_data in array_datas])
    fig_keywords = fig_keywords + list(common_kl)
    result_object = ResultObject(window, fig_keywords, DataType.figure,
                                 CommandStatus.Success, add_to_cache=True)
    result_object.createName(fig_keywords)
    self.current_figure = figure
    return result_object
def read(self, file_path, keyword_list):
    """
    Read an image file, display it and store it in history.

    Parameters:
        file_path location of the image passed to ``imread``
        keyword_list keywords used to describe the image

    Returns an image ResultObject, or an error ResultObject when the file
    cannot be read.
    """
    try:
        data = imread(file_path)
    except Exception:
        # Any read failure is reported through the status only, but
        # KeyboardInterrupt/SystemExit must still propagate.
        return ResultObject(None, None, None,
                            command_status=CommandStatus.Error)
    win = Window.window()
    plt.imshow(data)
    plt.gca().axis('off')
    win.show()
    result = ResultObject(data, keyword_list, DataType.image,
                          CommandStatus.Success, add_to_cache=True)
    result.createName(keyword_list)
    return result
def add_categories_as_columns(self, uniqVals, col_data, col_split,
                              result_objects, command_status):
    """
    Module to convert a categorical column into a bunch of logical arrays

    One result per value in ``uniqVals`` is appended to ``result_objects``
    (which is also returned); its keywords are the lower-cased words of
    the value followed by ``col_split``.
    """
    for category in uniqVals:
        membership = col_data == category
        label_words = []
        for word in splitPattern(str(category)):
            label_words.append(word.lower())
        keywords = label_words + col_split
        category_result = ResultObject(membership * 1, keywords,
                                       DataType.logical_array,
                                       command_status)
        category_result.createName(keywords)
        result_objects.append(category_result)
    return result_objects
def evaluate(self, array_data):
    """
    Calculate range value of the array and store it to history

    NaN/NaT entries are ignored, and so are entries excluded by
    ``StatContainer.conditional_array`` when that filter has the same size
    as the array. The range, minimum and maximum are also shown as a
    one-row table. Unsupported dtypes yield an error ResultObject.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    array = array_data.data
    if numpy.issubdtype(array.dtype, numpy.number):
        idx = numpy.logical_not(numpy.isnan(array))
    elif numpy.issubdtype(array.dtype, numpy.datetime64):
        idx = numpy.logical_not(numpy.isnat(array))
    else:
        # The message used to say "find max" (copied from the max command).
        Printer.Print("The array is not supported type so cannot find range")
        return result_object

    if (StatContainer.conditional_array is not None and
            StatContainer.conditional_array.data.size == array.size):
        idx = numpy.logical_and(idx, StatContainer.conditional_array.data)

    selected = array[idx]
    max_val = numpy.max(selected)
    min_val = numpy.min(selected)
    range_val = max_val - min_val

    result_object = ResultObject(range_val, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(array_data.keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)

    df_new = pd.DataFrame()
    df_new['Feature'] = [array_data.name]
    df_new['Range'] = [range_val]
    df_new['Minimum'] = [min_val]
    df_new['Maximum'] = [max_val]
    TablePrinter.printDataFrame(df_new)
    return result_object
def evaluate(self, array_datas):
    """
    Create a new dataframe using the supplied arrays

    Returns a csv ResultObject holding the combined frame (which is also
    printed), or an error ResultObject when the arrays cannot be combined.
    """
    status, combined, _, common_name = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if status == CommandStatus.Error:
        Printer.Print("Please check whether the arrays are of the same size")
        return ResultObject(None, None, None, CommandStatus.Error)

    frame_result = ResultObject(combined, [], DataType.csv,
                                CommandStatus.Success)
    frame_result.createName(common_name,
                            command_name='concatenate.array',
                            set_keyword_list=True)
    TablePrinter.printDataFrame(combined)
    return frame_result
def evaluate(self, array_datas):
    """
    Subtract the first array from the second one, element by element.

    Parameters:
        array_datas the two arrays; the result is ``second - first``

    Returns an array ResultObject, or an error ResultObject when the arrays
    cannot be combined into a data frame or cannot be subtracted.
    """
    command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        Printer.Print("please try the following command:",
                      "subtract a from b")
        return ResultObject(None, None, None, CommandStatus.Error)

    # DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` gives
    # the same ndarray on old and new versions.
    df_array = df.values
    try:
        out = df_array[:, 1] - df_array[:, 0]
    except Exception:
        # e.g. fewer than two columns or non-numeric data
        return ResultObject(None, None, None, CommandStatus.Error)

    result_object = ResultObject(out, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(array_datas[0].keyword_list,
                             array_datas[1].keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)
    return result_object
def read(self, file_path, keyword_list, recursive=False,
         folder_database=None):
    """
    Load the file name specified and store it in history
    Parameters:
        file_path folder location
        keyword_list keywords used to describe the folder
        recursive when true, sub-folders are read as well
        folder_database database to add the files to; a new one is
            created when omitted

    Returns a folder ResultObject when this call created the database,
    ``False`` when a database was passed in, and an error ResultObject when
    the folder cannot be found.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    create_result = folder_database is None
    if create_result:
        folder_database = TypeDatabase(
            data_type_list=[DataType.csv, DataType.image])
    if len(keyword_list) == 0:
        keyword_list = splitPattern(file_path)

    if not os.path.isdir(file_path):
        # Retry relative to the user's home directory
        file_path = os.path.join(os.path.expanduser('~'), file_path)
        if not os.path.isdir(file_path):
            print("Cannot find folder: ", file_path)
            return result_object

    for entry in os.scandir(file_path):
        if (self.checkEndsWith(entry.name, ['.csv', '.xlsx']) and
                entry.is_file()):
            entry_type = DataType.csv
        elif (self.checkEndsWith(entry.name,
                                 ['.png', '.jpg', '.JPG', '.jpeg']) and
              entry.is_file()):
            entry_type = DataType.image
        else:
            entry_type = None
        if entry_type is not None:
            self.addFile(entry, entry_type, folder_database, file_path)

        if recursive and entry.is_dir():
            sub_keywords = splitPattern(entry.name)
            self.read(os.path.join(file_path, entry.name),
                      keyword_list + sub_keywords, True, folder_database)

    if not create_result:
        return False
    folder_result = ResultObject(FolderObject(folder_database, file_path),
                                 keyword_list, DataType.folder,
                                 CommandStatus.Success)
    folder_result.createName(keyword_list)
    return folder_result
def evaluate(self, array_datas, data_frame):
    """
    Calculate ttest of the array and store it to history

    For every feature column an unequal-variance t-test
    (``scipy.stats.ttest_ind`` with ``equal_var=False``) is run between
    each pair of ground-truth categories; the p-values are collected in a
    table with one row per feature.

    Parameters:
        array_datas arrays to analyze, used only when no data frame is given
        data_frame data frame to analyze, takes precedence over the arrays

    Returns a list of ResultObjects: the p-value table followed by one
    array per table column, or a single error ResultObject when the input
    or the ground truth is unusable.
    """
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)

    if StatContainer.ground_truth is None:
        Printer.Print("Could not find the reference variable.")
        Printer.Print("Please set the reference variable")
        return ResultObject(None, None, None, CommandStatus.Error)

    gtVals = StatContainer.filterGroundTruth()
    ground_truth = StatContainer.ground_truth.name
    if len(gtVals) != df.shape[0]:
        Printer.Print(
            "The size of the ground truth does not match with arrays being analyzed"
        )
        Printer.Print(len(gtVals), df.shape[0])
        return ResultObject(None, None, None, CommandStatus.Error)

    uniqVals = StatContainer.isCategorical(gtVals)
    if uniqVals is None:
        # isCategorical reports non-categorical data with None; there are
        # no groups to compare in that case.
        Printer.Print("The reference variable is not categorical")
        return ResultObject(None, None, None, CommandStatus.Error)

    # Analyze every column except the ground truth itself. The caller's
    # data frame is left untouched (no ground-truth column is added to it).
    if ground_truth in df.columns:
        feature_names = df.columns.drop(ground_truth).values
    else:
        feature_names = df.columns.values
    df_new = pd.DataFrame()
    df_new['features'] = feature_names

    n_groups = len(uniqVals)
    pairs = [(first, second)
             for first in range(n_groups)
             for second in range(first + 1, n_groups)]
    for first, second in pairs:
        df_new['pValue: ' + str(first) + ' vs ' + str(second)] = np.zeros(
            df_new.shape[0])

    for row, feature in enumerate(feature_names):
        arr = df[feature]
        for first, second in pairs:
            column = 'pValue: ' + str(first) + ' vs ' + str(second)
            a = arr[gtVals == uniqVals[first]]
            b = arr[gtVals == uniqVals[second]]
            if uniqVals[first] != uniqVals[second]:
                ttest_val = scipy.stats.ttest_ind(a, b, axis=0,
                                                  equal_var=False)
                # .loc writes into the table itself; chained indexing
                # (df[col][row] = ...) may write into a temporary copy.
                df_new.loc[row, column] = ttest_val.pvalue
            else:
                df_new.loc[row, column] = 0

    TablePrinter.printDataFrame(df_new)

    result_objects = []
    command_name = 'sigtest'
    # Adding the newly created csv
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name=command_name,
                             set_keyword_list=True)
    result_objects.append(result_object)

    # create an updated list of column names by removing the common names
    kl1 = df_new.columns
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    for col in range(len(kl1)):
        result_object = ResultObject(df_new[kl1[col]], [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(truncated_kl1[col],
                                 command_name=command_name,
                                 set_keyword_list=True)
        result_objects.append(result_object)
    return result_objects
def preRead(self, file_path, keyword_list):
    """
    Load a csv (or, failing that, excel) file and expose the table and its
    columns as results.

    Parameters:
        file_path location of the file
        keyword_list keywords used to describe the table

    Returns a list of ResultObjects: the table first, then one array per
    column (columns are not extracted individually when there are more
    than 5000 of them). Columns whose header matches
    ``self.col_head_pattern`` are dropped from the table. String columns
    whose first value contains '%', '$' or ',' are converted to floats when
    possible. The table's columns are renamed to the generated result
    names. When the file cannot be read a single error ResultObject is
    returned.
    """
    command_status = CommandStatus.Success
    try:
        data = pd.read_csv(file_path)
    except Exception:
        # Not readable as csv; fall back to the excel reader.
        try:
            data = pd.read_excel(file_path)
        except Exception:
            return ResultObject("File not found", None, None,
                                CommandStatus.Error)

    result_objects = []
    result_object = ResultObject(data, keyword_list, DataType.csv,
                                 command_status, add_to_cache=True)
    result_object.createName(result_object.keyword_list)
    result_objects.append(result_object)

    # Too many columns do not extract them individually
    if len(data.columns) > 5000:
        return result_objects

    new_column_names = []
    # Iterate over a snapshot because columns are dropped inside the loop.
    for column in list(data.columns):
        if self.col_head_pattern.match(column):
            data.drop(column, axis=1, inplace=True)
            continue

        col_split = splitPattern(column)
        col_data = data[column].values
        col_keyword_list = col_split
        if col_data.size == 0:
            # No rows: nothing to extract, but keep the header so that the
            # renaming below still has one name per remaining column.
            new_column_names.append(column)
            continue

        if isinstance(col_data[0], str):
            if '%' in col_data[0]:
                try:
                    col_data = data[column].str.rstrip('%').astype(
                        float, copy=False)
                    data[column] = col_data
                    if 'percent' not in col_keyword_list:
                        col_keyword_list.append('percent')
                except ValueError:
                    # Not every value is a percentage; keep the strings.
                    pass
            elif '$' in col_data[0] or ',' in col_data[0]:
                try:
                    col_data = data[column].str.translate(
                        self.currency_dict).astype(float, copy=False)
                    data[column] = col_data
                    if '$' not in col_keyword_list:
                        col_keyword_list.append('$')
                except ValueError:
                    # Not every value is an amount; keep the strings.
                    pass

        result_object = ResultObject(col_data, col_keyword_list,
                                     DataType.array, command_status,
                                     add_to_cache=True)
        result_object.createName(col_keyword_list)
        new_column_names.append(result_object.name)
        result_objects.append(result_object)

    # Replace columns:
    data.columns = new_column_names
    return result_objects
def evaluate(self, array_datas, data_frame):
    """
    Calculate ROC of the array and store it to history

    For every feature column a logistic regression is fitted on that single
    feature against the ground truth; the area under the ROC curve is
    computed for each class (one-vs-rest on the predicted probabilities)
    and averaged over the classes.

    Parameters:
        array_datas arrays to analyze, used only when no data frame is given
        data_frame data frame to analyze, takes precedence over the arrays

    Returns a list of ResultObjects: the AUC table followed by one array
    per table column, or a single error ResultObject when the input or the
    ground truth is unusable.
    """
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)

    if StatContainer.ground_truth is None:
        Printer.Print("Could not find the reference variable.")
        Printer.Print("Please set the reference variable")
        return ResultObject(None, None, None, CommandStatus.Error)

    gtVals = StatContainer.filterGroundTruth()
    ground_truth = StatContainer.ground_truth.name
    if len(gtVals) != df.shape[0]:
        Printer.Print(
            "The size of the ground truth does not match with arrays being analyzed"
        )
        Printer.Print(len(gtVals), df.shape[0])
        return ResultObject(None, None, None, CommandStatus.Error)

    uniqVals = StatContainer.isCategorical(gtVals)
    if uniqVals is None:
        # isCategorical reports non-categorical data with None; there are
        # no classes to separate in that case.
        Printer.Print("The reference variable is not categorical")
        return ResultObject(None, None, None, CommandStatus.Error)

    # Analyze every column except the ground truth itself. The caller's
    # data frame is left untouched (no ground-truth column is added to it).
    if ground_truth in df.columns:
        feature_names = df.columns.drop(ground_truth).values
    else:
        feature_names = df.columns.values
    df_new = pd.DataFrame()
    df_new['features'] = feature_names

    avgAUC = []
    for feature in feature_names:
        X1 = df[feature].values.reshape(-1, 1)
        model = LogisticRegression()
        model.fit(X1, gtVals)
        # evaluate the model
        Y_Pr = model.predict_proba(X1)
        allAUC = []
        # Column k of predict_proba belongs to model.classes_[k], so take
        # the positive label from there rather than from a separately
        # computed list whose order may differ.
        for k, label in enumerate(model.classes_):
            fpr, tpr, thresholds = metrics.roc_curve(
                gtVals, Y_Pr[:, k], pos_label=label)
            allAUC.append(metrics.auc(fpr, tpr))
        avgAUC.append(np.mean(allAUC))
    df_new['AUC'] = avgAUC
    TablePrinter.printDataFrame(df_new)

    # New data frame
    result_objects = []
    command_name = 'rcurve'
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name=command_name,
                             set_keyword_list=True)
    result_objects.append(result_object)

    # create an updated list of column names by removing the common names
    kl1 = df_new.columns
    truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
    for col in range(len(kl1)):
        result_object = ResultObject(df_new[kl1[col]], [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(truncated_kl1[col],
                                 command_name=command_name,
                                 set_keyword_list=True)
        result_objects.append(result_object)
    return result_objects
def evaluate(self, data_frame, array_datas, target):
    """
    Cluster the rows of a data frame (or of several arrays).

    The data is standardised with ``StandardScaler`` and handed to
    ``self.performOperation`` together with the requested number of
    clusters. When a ground truth is set, a heatmap of the ground-truth
    versus cluster-assignment contingency table is shown as well.

    Parameters:
        data_frame: object exposing ``data`` (DataFrame-like) and ``name``.
            Takes precedence over ``array_datas``.
        array_datas: arrays to cluster; converted to a data frame through
            ``DataGuru.transformArray_to_dataFrame``. Only used when
            ``data_frame`` is None.
        target: object whose ``data`` is searched for the number of
            clusters with ``findNumbers``; 2 clusters are used when no
            usable number is found.

    Returns:
        A single ``ResultObject`` with ``CommandStatus.Error`` when no
        usable input is available. Otherwise a list of results: the heatmap
        figure first (only when a ground truth is set), then the cluster
        assignments as a ``DataType.array`` result.
    """
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, _, cname = DataGuru.transformArray_to_dataFrame(
            array_datas, useCategorical=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)

    has_ground_truth = StatContainer.ground_truth is not None
    Y = None
    if has_ground_truth:
        df = DataGuru.removeGT(df, StatContainer.ground_truth)
        Y = StatContainer.filterGroundTruth()
        # Drop incomplete rows from the data and the labels together.
        df, Y = DataGuru.removenan(df, Y)
    else:
        # Non-inplace on purpose: `df` may be the caller's stored frame and
        # must not lose rows as a side effect of clustering.
        df = df.dropna()

    if df.shape[0] == 0:
        # Nothing left to scale or cluster once incomplete rows are removed.
        Printer.Print("No data")
        return ResultObject(None, None, None, CommandStatus.Error)

    # Standardise the features before clustering.
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Requested cluster count; defaults to 2 when none is given or when the
    # number truncates to less than one cluster (e.g. 0.5).
    num_clusters = 2
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        requested = int(numbers[0].data)
        if requested >= 1:
            num_clusters = requested

    kY = self.performOperation(X, num_clusters)

    result_objects = []
    if has_ground_truth:
        # Contingency table: rows are ground-truth labels, columns are
        # cluster ids, cells are counts. The previous pivot_table call threw
        # its result away, so the heatmap showed the raw label columns.
        # The counts are already numeric, so no string-to-number conversion
        # of the table is needed before plotting.
        contingency = pd.crosstab(np.asarray(Y), np.asarray(kY),
                                  rownames=['ground_truth'],
                                  colnames=['clustering_result'])
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        sns.heatmap(contingency, ax=ax)
        win.show()
        source = data_frame if data_frame is not None else array_datas
        result_objects.append(
            VizContainer.createResult(win, source, ['clstr.fig']))

    result_object = ResultObject(kY, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name="clstr",
                             set_keyword_list=True)
    result_objects.append(result_object)
    return result_objects
def evaluate(self, array_data, target):
    """
    Select the top / bottom / first N entries of an array.

    The mode is taken from ``self._condition[0]``:

    * ``"top"`` / ``"bottom"``: for numeric data the N largest / smallest
      non-NaN values are selected; for non-numeric data the N most / least
      frequent distinct values are selected and every element equal to one
      of them is marked. The result is a logical array over the full,
      unfiltered input.
    * ``"first"``: prints the first N entries (or, without a count, a short
      summary of the values) and returns an empty success result.

    If ``StatContainer.conditional_array`` is set and matches the array
    length, only the elements it selects are considered.

    Parameters:
        array_data: object exposing ``data`` (array-like with ``shape``,
            ``size`` and ``dtype``), ``name`` and ``keyword_list``.
        target: object whose ``data`` is searched for the count N with
            ``findNumbers``.

    Returns:
        ``ResultObject`` holding the logical selection array for top/bottom,
        an empty success result for "first", or a ``CommandStatus.Error``
        result when there is no data, no usable count, or an unknown mode.
    """
    result = ResultObject(None, None, None, CommandStatus.Error)
    in_array = array_data.data
    N = in_array.shape[0]

    condition = StatContainer.conditional_array
    use_condition = condition is not None and len(condition.data) == N

    # `positions` tracks, for every element that survives filtering, its
    # row position in the unfiltered array (needed to look up row labels).
    positions = np.arange(N)
    if use_condition:
        in_array = in_array[condition.data]
        positions = positions[condition.data]
    if in_array.size == 0:
        Printer.Print("No data")
        return result

    nan_idx = StatContainer.getNanIdx(in_array)
    non_nan_idx = np.logical_not(nan_idx)
    non_nan_array = in_array[non_nan_idx]
    positions = positions[non_nan_idx]

    numbers = findNumbers(target.data, 1)
    try:
        unique_arr, inv, counts = np.unique(non_nan_array,
                                            return_inverse=True,
                                            return_counts=True)
    except Exception:
        # e.g. values of mixed types that cannot be ordered
        return result

    mode = self._condition[0]
    num = 0
    if numbers != [] and numbers[0].data > 0:
        num = int(numbers[0].data)

    if num < 1:
        # No usable count (none given, or it truncated to zero).
        if mode == "first":
            if unique_arr.size < 50:
                Printer.Print(unique_arr)
            else:
                Printer.Print(non_nan_array[:10])
            result = ResultObject(None, None, None, CommandStatus.Success)
        return result

    if mode == "first":
        # Uses the requested count as given; it is not limited by the
        # number of distinct values.
        Printer.Print(array_data.data[:num])
        return ResultObject(None, None, None, CommandStatus.Success)

    if mode == "top":
        Printer.Print("Finding top", num)
        heading = "Top values:"
        ascending = False
    elif mode == "bottom":
        Printer.Print("Finding bottom", num)
        heading = "Worst values:"
        ascending = True
    else:
        Printer.Print("Did not find the right condition")
        return result

    is_numeric = np.issubdtype(non_nan_array.dtype, np.number)
    # argpartition raises if asked for more entries than exist, so clamp
    # to the number of candidates (values, or distinct values).
    candidates = non_nan_array.size if is_numeric else unique_arr.size
    num = min(num, candidates)
    if num < 1:
        Printer.Print("No data")
        return result

    # Ranking key: the values themselves for numeric data, otherwise the
    # frequency of each distinct value.
    ranking = non_nan_array if is_numeric else counts
    if ascending:
        # kth = num - 1 guarantees the first `num` slots hold the smallest.
        chosen = np.argpartition(ranking, num - 1)[:num]
    else:
        # kth = -num guarantees the last `num` slots hold the largest.
        chosen = np.argpartition(ranking, -num)[-num:]

    if is_numeric:
        idx = np.full(non_nan_array.size, False)
        idx[chosen] = True
        if num <= 30:
            if StatContainer.row_labels is not None:
                df_new = pd.DataFrame(
                    {array_data.name: non_nan_array[chosen]})
                # Assumes row labels are aligned with the unfiltered array,
                # so map the chosen entries back to their original rows
                # before looking the labels up.
                labels = np.asarray(StatContainer.row_labels.data)
                df_new[StatContainer.row_labels.name] = labels[
                    positions[chosen]]
                TablePrinter.printDataFrame(df_new)
                TablePrinter.sort(0, ascending=ascending)
            else:
                Printer.Print(heading)
                Printer.Print(non_nan_array[chosen])
    else:
        # `chosen` indexes distinct values; mark every element equal to one.
        idx = np.isin(inv, chosen)
        if num <= 30:
            Printer.Print(heading)
            Printer.Print(unique_arr[chosen])

    # Expand the selection back to the full input: undo the NaN filter
    # first, then the conditional filter.
    out1 = np.full(in_array.size, False)
    out1[non_nan_idx] = idx
    if use_condition:
        out = np.full(N, False)
        out[condition.data] = out1
    else:
        out = out1
    result = ResultObject(out, [], DataType.logical_array,
                          CommandStatus.Success, True)
    result.createName(array_data.keyword_list, command_name=mode,
                      set_keyword_list=True)
    return result
def evaluate(self, array_datas):
    """
    Compute a label-wise (group-wise) statistic of the given arrays.

    The arrays are combined into a data frame, the ground truth is attached
    as a grouping column, incomplete rows are dropped, and
    ``self.performOperation`` is applied per group. Which statistic is
    computed is up to ``performOperation``; its name is taken from
    ``self._condition[0]`` for naming the results.

    When no ground truth is set, all rows fall into one group. A numeric
    ground truth that is not categorical is cut into 10 equal-width bins
    and the bins are used as groups.

    Parameters:
        array_datas: the arrays to analyse (an empty list is rejected).

    Returns:
        A single ``ResultObject`` with ``CommandStatus.Error`` when the
        input is empty or cannot be converted to a data frame. Otherwise a
        list: one ``DataType.array`` result per column of the grouped
        table, or a one-element list holding an error result when the
        ground truth can be neither used as categories nor binned.
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    if isinstance(array_datas, list) and len(array_datas) == 0:
        return result_object

    command_status, df, _, _ = DataGuru.transformArray_to_dataFrame(
        array_datas)
    if command_status == CommandStatus.Error:
        return ResultObject(None, None, None, CommandStatus.Error)

    if StatContainer.ground_truth is None:
        # No labels available: treat the whole data set as one group.
        gtVals = np.ones(df.shape[0])
        gtName = 'ground_truth'
    else:
        gtVals = StatContainer.filterGroundTruth()
        gtName = StatContainer.ground_truth.name

    # Attach the labels first so rows with a missing value in either the
    # data or the label are dropped together.
    df[gtName] = gtVals
    df.dropna(inplace=True)
    gtVals = df[gtName]

    uniqVals = StatContainer.isCategorical(gtVals, uniqueCutoff=1000)
    binned_ground_truth = False
    if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
        # Continuous numeric labels: convert to categories by binning.
        df[gtName] = pd.cut(gtVals, 10)
        binned_ground_truth = True

    # Previously the binned case was still rejected here because only
    # `uniqVals` was checked, which made the binning above dead code.
    if uniqVals is None and not binned_ground_truth:
        Printer.Print("The array is not of numeric type so cannot",
                      "calculate groupwise " + self._condition[0])
        return [result_object]

    # Create the group-wise arrays, one result per output column.
    df_new = self.performOperation(df, gtName)
    df_new = df_new.reset_index()
    command_name = 'labelwise.' + self._condition[0]
    result_objects = []
    for col in df_new.columns:
        if col == '':
            # An unnamed column inherits the keywords of the first input.
            kName = array_datas[0].keyword_list
        else:
            kName = [col]
        result_object = ResultObject(df_new[col], [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(kName, command_name=command_name,
                                 set_keyword_list=True)
        result_objects.append(result_object)
    TablePrinter.printDataFrame(df_new)
    return result_objects