Ejemplo n.º 1
0
 def evaluate(self, array_data, user_conv):
     """
     Extract date/time components from an array of dates

     String data is parsed with ``pd.to_datetime`` and the parsed values
     are written back to ``array_data.data``. For every component name
     (day, year, month, hour, minute) found in ``user_conv.data``, in
     singular or plural form, a new array result is created.

     Parameters:
         array_data result object whose ``data`` holds strings or
             datetime values
         user_conv result object whose ``data`` is searched for the
             component names

     Returns:
         A list of ResultObject (one per requested component), a Success
         ResultObject without data when no component was requested, or
         an Error ResultObject when the data cannot be read as dates.
     """
     # ``pd.datetime`` was only an alias of ``datetime.datetime`` and is
     # no longer available in current pandas, so use the stdlib class.
     from datetime import datetime

     try:
         if isinstance(array_data.data[0], str):
             # ``infer_datetime_format`` is deprecated and has no effect
             # on the parsed result, so it is not passed any more.
             date_time = pd.to_datetime(array_data.data)
             array_data.data = date_time
         else:
             date_time = array_data.data
         if not isinstance(array_data.data[0], datetime):
             raise RuntimeError()
     except Exception:
         # Any failure (empty data, unparsable strings, wrong type) is
         # reported to the user as a conversion problem.
         Printer.Print("Cannot transform data to date time")
         return ResultObject(None, None, None, CommandStatus.Error)
     results = []
     for word in ['day', 'year', 'month', 'hour', 'minute']:
         if word in user_conv.data or word + 's' in user_conv.data:
             out = getattr(date_time, word)
             result = ResultObject(out, [], DataType.array)
             result.createName(array_data.keyword_list,
                               command_name=word,
                               set_keyword_list=True)
             results.append(result)
             Printer.Print('Saving ', word, 'from ', array_data.name, ' as',
                           result.name)
     if results:
         return results
     return ResultObject(None, None, None, CommandStatus.Success)
Ejemplo n.º 2
0
    def evaluate(self, history, user_conv, name=None):
        """
        Save the notebook/chat, a table, or the last history element

        Parameters:
            history result object whose ``data`` provides
                ``getLastObject()`` and ``last_data_type``
            user_conv result object whose ``data`` is searched for the
                words 'notebook', 'chat' and 'table'
            name optional result object whose ``data`` is the name to
                save under

        Returns:
            A ResultObject with Success or Error status.
        """
        outcome = ResultObject(None, None, None, CommandStatus.Error)

        # Saving the notebook/chat takes precedence over everything else.
        if 'notebook' in user_conv.data or 'chat' in user_conv.data:
            Printer.save(name)
            return ResultObject(None, None, None, CommandStatus.Success)

        if 'table' in user_conv.data:
            if save_table(name, user_conv):
                return ResultObject(None, None, None, CommandStatus.Success)
            return outcome

        # Without a name there is nothing to store the last object under.
        if name is None:
            return outcome

        try:
            last_object = history.data.getLastObject()
            new_keywords = name.data.lower().split(' ')
            outcome = ResultObject(last_object.data, new_keywords,
                                   history.data.last_data_type,
                                   CommandStatus.Success)
            outcome.createName(new_keywords)
            Printer.Print("Saving ", ' '.join(last_object.keyword_list),
                          ' as ', outcome.name)
        except RuntimeError:
            Printer.Print("Cannot find last object from history")

        return outcome
Ejemplo n.º 3
0
    def evaluate(self, csv_data):
        """
        Standardize the non-categorical, non-string columns of a csv

        Works on a copy of ``csv_data.data``. Each qualifying column has
        the mean of its non-NaN values subtracted and is divided by their
        standard deviation.

        Parameters:
            csv_data result object whose ``data`` is a data frame

        Returns:
            A Success ResultObject of type csv holding the scaled copy.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        scaled = csv_data.data.copy()

        for col_name in scaled.columns:
            observed = scaled[col_name].dropna()
            categories = StatContainer.isCategorical(observed)
            # Leave categorical, empty and string-valued columns untouched.
            if categories is not None:
                continue
            if len(observed) == 0:
                continue
            if isinstance(observed.iloc[0], str):
                continue
            centre = numpy.mean(observed)
            spread = numpy.std(observed)
            scaled[col_name] = (scaled[col_name] - centre) / spread

        Printer.Print("Saving the scaled data...")
        result_object = ResultObject(scaled, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(csv_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        return result_object
Ejemplo n.º 4
0
 def createResult(self, out, keyword_list):
     """
     Wrap ``out`` in a logical-array result named with the 'between' tag

     Parameters:
         out data to store in the result
         keyword_list keywords used to build the result name

     Returns:
         The newly created ResultObject.
     """
     # NOTE(review): the trailing positional True is presumably the
     # add_to_cache flag of ResultObject - confirm against its signature.
     between_result = ResultObject(
         out, [], DataType.logical_array, CommandStatus.Success, True)
     between_result.createName(
         keyword_list, command_name='between', set_keyword_list=True)
     return between_result
Ejemplo n.º 5
0
    def evaluate(self, array_data):
        """
        Calculate the sum of the non-NaN elements of a numeric array

        When a conditional array of the same size is set on StatContainer,
        only the elements it selects are summed.

        Parameters:
            array_data result object whose ``data`` is the array to sum

        Returns:
            A Success ResultObject holding the sum, or an Error
            ResultObject when the array is not numeric.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        values = array_data.data

        if not numpy.issubdtype(values.dtype, numpy.number):
            Printer.Print("The array is not of numeric type so cannot",
                          "take sum")
            return result_object

        keep = numpy.logical_not(numpy.isnan(values))
        if (StatContainer.conditional_array is not None and
                StatContainer.conditional_array.data.size == values.size):
            keep = numpy.logical_and(keep,
                                     StatContainer.conditional_array.data)
        total = numpy.sum(values[keep])

        result_object = ResultObject(total, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        Printer.Print("Sum of", array_data.name, "is", total)
        return result_object
Ejemplo n.º 6
0
    def evaluate(self, array_data):
        """
        Combine logical arrays with the operator stored in ``self._operator``

        '!' negates the first array; '&', '||' and '^' fold the remaining
        arrays into the first one. Entries not selected by the active
        conditional array (when one of matching size is set) are forced
        to False.

        Parameters:
            array_data sequence of result objects holding logical arrays

        Returns:
            A Success ResultObject of type logical_array, or an Error
            ResultObject when no array is given or the operator cannot be
            applied to additional arrays.
        """
        N = len(array_data)
        if N < 1:
            return ResultObject(None, None, None, CommandStatus.Error)
        out = array_data[0].data
        Printer.Print("Performing logical", self._add_tags[0], "on ")
        Printer.Print(array_data[0].name)
        if self._operator == '!':
            out = np.logical_not(array_data[0].data)

        for arr_data in array_data[1:]:
            Printer.Print(", ", arr_data.name)
            if self._operator == '&':
                out = np.logical_and(out, arr_data.data)
            elif self._operator == '||':
                out = np.logical_or(out, arr_data.data)
            elif self._operator == '^':
                out = np.logical_xor(out, arr_data.data)
            else:
                return ResultObject(None, None, None, CommandStatus.Error)
            Printer.Print(arr_data.name)

        # The conditional array is a result object: the mask lives in its
        # ``data`` attribute, and it is only applied when sizes match, as
        # in the other commands that honour the conditional array.
        conditional = StatContainer.conditional_array
        if conditional is not None and conditional.data.size == out.size:
            if out is array_data[0].data:
                # Do not overwrite the caller's array in place.
                out = out.copy()
            out[np.logical_not(conditional.data)] = False

        result = ResultObject(out, [], DataType.logical_array,
                              CommandStatus.Success, True)
        if len(array_data) > 1:
            keyword_list2 = array_data[1].keyword_list
        else:
            keyword_list2 = []
        result.createName(array_data[0].keyword_list,
                          keyword_list2,
                          command_name=self._add_tags[0],
                          set_keyword_list=True)
        return result
Ejemplo n.º 7
0
    def read(self, file_path, keyword_list):
        """
        Load algorithm properties from a file and build the matching model

        Parameters:
            file_path location passed to ``self.createProperties``
            keyword_list keywords used to describe and name the result

        Returns:
            A Success ResultObject of type algorithm_arg whose data is
            ``[model, property_data, model_name, self.updateModel]``, or
            an Error ResultObject when the properties or the model could
            not be created.
        """
        try:
            property_data, model_name = self.createProperties(file_path)
            model = DataGuru.createModel(property_data, model_name)
        except Exception:
            # ``Exception`` instead of a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            Printer.Print("File not found")
            return ResultObject(None, None, None, CommandStatus.Error)

        command_status = CommandStatus.Success
        result_data = [model, property_data, model_name, self.updateModel]
        result_object = ResultObject(result_data,
                                     keyword_list,
                                     DataType.algorithm_arg,
                                     command_status,
                                     add_to_cache=True)
        result_object.createName(keyword_list)

        # A property editor can only be attached when the GUI hooks are set.
        if (PropertyEditor.parent_widget is None
                or PropertyEditor.property_editor_class is None):
            Printer.Print("Cannot modify algorithm properties in non-GUI mode")
        else:
            property_editor = PropertyEditor.property_editor_class(
                result_object)
            PropertyEditor.addPropertyEditor(property_editor)

        return result_object
Ejemplo n.º 8
0
    def evaluate(self, array_data):
        """
        Calculate the standard deviation of the non-NaN elements of an array

        When a conditional array of the same size is set on StatContainer,
        only the elements it selects are used. The value is also shown as
        a one-row table.

        Parameters:
            array_data result object whose ``data`` is the array to analyze

        Returns:
            A Success ResultObject holding the standard deviation, or an
            Error ResultObject when the array is not numeric.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        values = array_data.data

        if not numpy.issubdtype(values.dtype, numpy.number):
            Printer.Print("The array is not of numeric type so cannot",
                          "find stdev")
            return result_object

        keep = numpy.logical_not(numpy.isnan(values))
        if (StatContainer.conditional_array is not None and
                StatContainer.conditional_array.data.size == values.size):
            keep = numpy.logical_and(keep,
                                     StatContainer.conditional_array.data)
        std_val = numpy.std(values[keep])

        result_object = ResultObject(std_val, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)

        # Report the value as a table instead of a plain text line.
        summary = pd.DataFrame()
        summary['Feature'] = [array_data.name]
        summary['Standard Deviation'] = [std_val]
        TablePrinter.printDataFrame(summary)

        return result_object
Ejemplo n.º 9
0
    def evaluate(self, array_datas):
        """
        Build a categorical array and a filter from logical arrays

        Every position selected by an input array is labelled with that
        array's name; positions selected by none stay 'Unknown'. The
        filter marks positions selected by at least one input. Inputs
        whose size differs from the first array are ignored.

        Parameters:
            array_datas one result object or a sequence of result objects
                whose ``data`` are logical arrays

        Returns:
            ``[categorical_result, filter_result]``, or an Error
            ResultObject when no array is supplied.
        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC
        # lives in ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(array_datas, Iterable):
            array_datas = [array_datas]
        array_datas = list(array_datas)
        if not array_datas:
            Printer.Print("Please provide arrays to create a categorical "
                          "array")
            return ResultObject(None, None, None, CommandStatus.Error)

        N = array_datas[0].data.size
        out = np.full(N, 'Unknown', dtype='U40')
        out_filter = np.full(N, False)
        Printer.Print("Creating a categorical array from: ")
        for array_data in array_datas:
            Printer.Print(array_data.name)
            if array_data.data.size == N:
                out[array_data.data] = array_data.name
                out_filter[array_data.data] = True
        kl1 = [" ".join(array_data.keyword_list) for array_data in array_datas]
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        if common_name == '':
            # Fall back to the keywords of the first input. This used to
            # index the loop variable ``array_data`` instead of the list.
            common_name_list = array_datas[0].keyword_list
        else:
            common_name_list = common_name.split(' ')

        result = ResultObject(out, [], DataType.array,
                              CommandStatus.Success)
        result.createName(common_name_list, command_name='categorical',
                          set_keyword_list=True)
        result_filter = ResultObject(out_filter, [], DataType.logical_array,
                                     CommandStatus.Success, True)
        result_filter.createName(common_name_list, command_name='filter',
                                 set_keyword_list=True)
        Printer.Print('Saving categorical array as', result.name)
        Printer.Print('Saving filter as', result_filter.name)
        return [result, result_filter]
Ejemplo n.º 10
0
    def evaluate(self, array_data):
        """
        Count the non-zero, non-NaN values of a numeric array

        When a conditional array of the same size is set on StatContainer,
        only the elements it selects are counted.

        Parameters:
            array_data result object whose ``data`` is the array to count

        Returns:
            A Success ResultObject holding the count, or an Error
            ResultObject when the array is not numeric.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        array = array_data.data
        # Apply the conditional array only when its size matches, as the
        # other statistics commands do; a mismatched mask cannot index
        # the array.
        conditional = StatContainer.conditional_array
        if conditional is not None and conditional.data.size == array.size:
            array = array[conditional.data]
        nan_idx = StatContainer.getNanIdx(array)
        if numpy.issubdtype(array.dtype, numpy.number):
            array_filtered = array[numpy.logical_not(nan_idx)]
            # count_nonzero: zero entries are not counted.
            count_val = numpy.count_nonzero(array_filtered)

            result_object = ResultObject(count_val, [],
                                         DataType.array,
                                         CommandStatus.Success)
            result_object.createName(
                    array_data.keyword_list,
                    command_name=self.commandTags()[0],
                    set_keyword_list=True)
            Printer.Print("Count of", array_data.name, "is", count_val)
        else:
            Printer.Print("The array is not of numeric type so cannot",
                          "find count")

        return result_object
Ejemplo n.º 11
0
    def evaluate(self, data_frame, classifier_algo):
        """
        Train the supplied classifier on a data frame against the ground truth

        The ground truth column is removed from the features, NaNs are
        dropped, the features are scaled with a StandardScaler and the
        model in ``classifier_algo.data[0]`` is fitted. The accuracy on
        the training data is printed.

        Parameters:
            data_frame result object whose ``data`` is the feature frame
            classifier_algo result object whose ``data[0]`` is the model

        Returns:
            A Success ResultObject of type trained_model whose data is
            ``{'Scaler': scaler, 'Model': model}``, or an Error
            ResultObject when no ground truth is set.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        features = data_frame.data

        # Training is impossible without labels.
        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object

        features = DataGuru.removeGT(features, StatContainer.ground_truth)
        labels = StatContainer.filterGroundTruth()
        features, labels = DataGuru.removenan(features, labels)

        model = classifier_algo.data[0]

        # Scale the raw feature matrix before fitting.
        raw_matrix = features.values
        scaler = preprocessing.StandardScaler().fit(raw_matrix)
        scaled_matrix = scaler.transform(raw_matrix)

        Printer.Print("Training the classifier")
        feature_table = pd.DataFrame()
        feature_table['Features'] = features.columns
        TablePrinter.printDataFrame(feature_table)

        model.fit(scaled_matrix, labels)
        Printer.Print("The classifier", classifier_algo.name,
                      "has been trained")

        predictions = model.predict(scaled_matrix)
        accuracy = metrics.accuracy_score(predictions, labels)
        Printer.Print("Accuracy on training set : %s" % "{0:.3%}".format(accuracy))

        result_object = ResultObject({'Scaler': scaler, 'Model': model}, [],
                                     DataType.trained_model,
                                     CommandStatus.Success)
        result_object.createName(
            data_frame.keyword_list,
            command_name=classifier_algo.name.replace('.', ' '),
            set_keyword_list=True)

        return result_object
Ejemplo n.º 12
0
    def evaluate(self, data_frame, target):
        """
        Rank features with a random forest and keep the top predictors

        The ground truth column is removed from the features, NaNs are
        dropped, the features are scaled and a 100-tree
        RandomForestClassifier is fitted. Features are sorted by
        importance and the leading ones are shown and returned.

        Parameters:
            data_frame result object whose ``data`` is the feature frame
            target result object whose ``data`` is searched for the number
                of features to keep (10 when no positive number is found)

        Returns:
            A Success ResultObject of type csv with the selected columns,
            or an Error ResultObject when no ground truth is set.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        features = data_frame.data

        # Ranking is impossible without labels.
        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object

        features = DataGuru.removeGT(features, StatContainer.ground_truth)
        labels = StatContainer.filterGroundTruth()
        features, labels = DataGuru.removenan(features, labels)

        # Number of features requested by the user, defaulting to 10.
        numbers = findNumbers(target.data, 1)
        if numbers != [] and numbers[0].data > 0:
            top_n = int(numbers[0].data)
        else:
            top_n = 10

        raw_matrix = features.values
        scaler = preprocessing.StandardScaler().fit(raw_matrix)
        scaled_matrix = scaler.transform(raw_matrix)

        forest = RandomForestClassifier(n_estimators=100)
        forest.fit(scaled_matrix, labels)

        importances = pd.Series(forest.feature_importances_,
                                index=features.columns)
        ranked = importances.sort_values(ascending=False)

        ranking_table = pd.DataFrame()
        ranking_table['top features'] = ranked.index[:top_n]
        ranking_table['feature importance'] = ranked.values[:top_n]
        TablePrinter.printDataFrame(ranking_table)

        selected = features[ranked.index[:top_n]]
        result_object = ResultObject(selected, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(data_frame.name,
                                 command_name='top.predictors',
                                 set_keyword_list=True)

        return result_object
Ejemplo n.º 13
0
 def createResult(self, out, keyword_list, create_name=True):
     """
     Wrap ``out`` in a logical-array result

     Parameters:
         out data to store in the result
         keyword_list keywords for the result
         create_name when True the name is generated from the keywords
             and ``self._condition[0]``; otherwise the keywords are only
             stored on the result

     Returns:
         The newly created ResultObject.
     """
     condition_result = ResultObject(
         out, [], DataType.logical_array, CommandStatus.Success, True)
     if not create_name:
         condition_result.keyword_list = keyword_list
         return condition_result
     condition_result.createName(
         keyword_list,
         command_name=self._condition[0],
         set_keyword_list=True)
     return condition_result
Ejemplo n.º 14
0
 def read(self, file_path, keyword_list):
     """
     Load a database csv and create a result for every entry it lists

     Each row must provide 'file_type', 'file_name', 'keywords' and
     'description'. Folder rows are loaded through ReadFolder (recursively
     when the description is 'recursive'); other rows become file_name
     results. Rows with an unknown file type or a missing file are skipped.

     Parameters:
         file_path file location which is expected to be of type csv
         keyword_list keywords used to describe the database

     Returns:
         A list of ResultObject for the loaded entries, or an Error
         ResultObject when the database file cannot be located.
     """
     result_object = ResultObject(None, None, None, CommandStatus.Error)
     skipped_files = 0
     mod_file_path = self.findFilePath(file_path)
     if mod_file_path is None:
         return result_object

     data_frame = pd.read_csv(mod_file_path)
     self.checkHeaders(data_frame.columns.values)
     loaded = []
     for idx, row in data_frame.iterrows():
         try:
             file_type = DataType[row['file_type']]
         except KeyError:
             Printer.Print("file type in line ", idx,
                           " not understood in", row['file_name'])
             Printer.Print("Skipping file ...")
             skipped_files += 1
             continue

         if file_type == DataType.folder:
             # Folders are delegated to the folder reader.
             Printer.Print("Loading folder: ", row['file_name'])
             folder_reader = ReadFolder()
             folder_result = folder_reader.read(
                 row['file_name'], row['keywords'].split(),
                 'recursive' == row['description'])
             if folder_result.command_status == CommandStatus.Success:
                 loaded.append(folder_result)
             else:
                 Printer.Print("Failed to load folder: ",
                               row['file_name'])
             continue

         row_file_path = self.findFilePath(row['file_name'])
         if row_file_path is None:
             Printer.Print("Cannot find file: ", row['file_name'])
             continue

         # Regular files are only registered by name here.
         file_object = FileObject(row_file_path, file_type,
                                  row['description'], False)
         keywords = row['keywords'].split(' ')
         file_res = ResultObject(file_object, keywords,
                                 DataType.file_name)
         file_res.createName(keywords)
         loaded.append(file_res)
     return loaded
Ejemplo n.º 15
0
 def evaluate(self, array_data, target):
     """
     Mark the entries of an array that contain the target words

     Parameters:
         array_data result object whose ``data`` is iterated entry by entry
         target result object whose ``data`` is split with splitPattern
             into the words to look for

     Returns:
         A Success ResultObject of type logical_array with one value per
         entry, as produced by ``self.containsWordList``.
     """
     split_target = splitPattern(target.data)
     matches = [self.containsWordList(entry, split_target)
                for entry in array_data.data]
     contains_result = ResultObject(
         np.array(matches), [], DataType.logical_array,
         CommandStatus.Success, True)
     contains_result.createName(
         array_data.keyword_list, split_target,
         command_name='contains', set_keyword_list=True)
     return contains_result
Ejemplo n.º 16
0
    def evaluate(self, array_data):
        """
        Calculate max value of the array and store it to history

        NaN/NaT entries are ignored and, when a conditional array of the
        same size is set on StatContainer, only the entries it selects are
        considered. When row labels are set, the label at the position of
        the maximum is reported as well.

        Parameters:
            array_data result object whose ``data`` is a numeric or
                datetime64 array

        Returns:
            A list of ResultObject: the row label of the maximum (only
            when row labels are set) followed by the maximum value. An
            Error ResultObject is returned when the dtype is unsupported
            or no valid entry remains.
        """
        result_objects = []
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        array = array_data.data

        if numpy.issubdtype(array.dtype, numpy.number):
            idx = numpy.logical_not(numpy.isnan(array))
        elif numpy.issubdtype(array.dtype, numpy.datetime64):
            idx = numpy.logical_not(numpy.isnat(array))
        else:
            Printer.Print("The array is not supported type so cannot find max")
            return result_object
        conditional = StatContainer.conditional_array
        if conditional is not None and conditional.data.size == array.size:
            idx = numpy.logical_and(idx, conditional.data)

        # Positions (in the full array) of the entries that take part.
        valid_positions = numpy.flatnonzero(idx)
        if valid_positions.size == 0:
            Printer.Print("The array has no valid values so cannot find max")
            return result_object

        valid_values = array[idx]
        max_val = numpy.max(valid_values)
        # argmax is relative to the filtered values; map it back to the
        # full array so that it can index the unfiltered row labels.
        max_pos = valid_positions[numpy.argmax(valid_values)]
        if StatContainer.row_labels is not None:
            rl = StatContainer.row_labels.data
            max_rl = rl[max_pos]
            # Result for max index
            result_object = ResultObject(max_rl, [], DataType.array,
                                         CommandStatus.Success)

            result_object.createName(StatContainer.row_labels.name,
                                     command_name=self.commandTags()[0],
                                     set_keyword_list=True)
            result_objects.append(result_object)
        # Result for max value
        result_object = ResultObject(max_val, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        result_objects.append(result_object)

        # Create a dataframe to show the results
        df_new = pd.DataFrame()
        df_new['Feature'] = [array_data.name]
        df_new['Maximum'] = [max_val]
        if StatContainer.row_labels is not None:
            df_new[StatContainer.row_labels.name] = [max_rl]
        TablePrinter.printDataFrame(df_new)
        return result_objects
Ejemplo n.º 17
0
    def evaluate(self, array_datas, data_frame):
        """
        Summarize a data frame (or a set of arrays) and store the results

        The summary table is produced by ``self.performOperation`` and is
        stored as a csv result; every column of the table is additionally
        stored as its own array result.

        Parameters:
            array_datas arrays to analyze, used when ``data_frame`` is None
            data_frame result object whose ``data`` is the frame to analyze

        Returns:
            A list of ResultObject (the summary table followed by one
            array per column), or an Error ResultObject when no input is
            given or the arrays cannot be turned into a data frame.
        """
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            # Check the status before touching the other return values,
            # which may not be usable after a failed conversion.
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
            if len(cname) == 0:
                cname = ".".join(kl1)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        df_new = self.performOperation(df)
        TablePrinter.printDataFrame(df_new)

        result_objects = []
        # Adding the newly created CSV
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        command_name = "smry"
        result_object.createName(cname,
                                 command_name=command_name,
                                 set_keyword_list=True)

        result_objects.append(result_object)
        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col, column_name in enumerate(kl1):
            result_object = ResultObject(df_new[column_name], [],
                                         DataType.array,
                                         CommandStatus.Success)

            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)

        return result_objects
Ejemplo n.º 18
0
    def createResult(self, window, array_datas, in_keywords):
        """
        Create the figure result for a plot window

        The keywords are 'figure', the figure number, ``in_keywords`` and
        the keywords shared by all plotted arrays. The window's current
        figure is remembered in ``self.current_figure``.

        Parameters:
            window plot window providing ``gcf()``
            array_datas one result object or a sequence of result objects
                that were plotted
            in_keywords list of extra keywords for the figure

        Returns:
            A Success ResultObject of type figure holding ``window``.
        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC
        # lives in ``collections.abc``.
        from collections.abc import Iterable

        figure = window.gcf()
        fig_keywords = ['figure', str(figure.number)] + in_keywords
        if not isinstance(array_datas, Iterable):
            array_datas = [array_datas]
        array_datas = list(array_datas)
        # TODO Later try adding some room for error like its there in 70% of the arrays
        if array_datas:
            # set.intersection() needs at least one set, hence the guard.
            common_kl = set.intersection(
                *[set(array_data.keyword_list) for array_data in array_datas])
            # Keep the order of the first array's keywords so that the
            # generated name does not depend on set iteration order.
            fig_keywords = fig_keywords + [
                keyword
                for keyword in dict.fromkeys(array_datas[0].keyword_list)
                if keyword in common_kl
            ]

        result_object = ResultObject(window, fig_keywords, DataType.figure,
                                     CommandStatus.Success, add_to_cache=True)
        result_object.createName(fig_keywords)
        self.current_figure = figure
        return result_object
Ejemplo n.º 19
0
    def read(self, file_path, keyword_list):
        """
        Load an image file, display it and store it in history

        Parameters:
            file_path location of the image, passed to ``imread``
            keyword_list keywords used to describe and name the image

        Returns:
            A Success ResultObject of type image holding the pixel data,
            or an Error ResultObject when the file cannot be read.
        """
        try:
            data = imread(file_path)
        except Exception:
            # ``Exception`` instead of a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return ResultObject(None, None, None,
                                command_status=CommandStatus.Error)

        # Show the image without axes in a new window.
        win = Window.window()
        plt.imshow(data)
        plt.gca().axis('off')
        win.show()
        result = ResultObject(data, keyword_list, DataType.image,
                              CommandStatus.Success, add_to_cache=True)
        result.createName(keyword_list)
        return result
Ejemplo n.º 20
0
 def add_categories_as_columns(self, uniqVals, col_data, col_split,
                               result_objects, command_status):
     """
     Convert a categorical column into one 0/1 array per category

     Parameters:
         uniqVals the category values to create arrays for
         col_data the column data compared against each category
         col_split keywords of the column, appended to each category's
             own keywords
         result_objects list that receives the created results
         command_status status stored on every created result

     Returns:
         ``result_objects`` with one logical_array result appended per
         category.
     """
     for category in uniqVals:
         # Keywords: lower-cased pieces of the category name, then the
         # keywords of the column.
         name_parts = [
             part.lower() for part in splitPattern(str(category))
         ]
         keywords = name_parts + col_split
         # Multiplying by 1 turns the comparison result into 0/1 values.
         membership = (col_data == category) * 1
         category_result = ResultObject(membership, keywords,
                                        DataType.logical_array,
                                        command_status)
         category_result.createName(keywords)
         result_objects.append(category_result)
     return result_objects
Ejemplo n.º 21
0
    def evaluate(self, array_data):
        """
        Calculate range value of the array and store it to history

        NaN/NaT entries are ignored and, when a conditional array of the
        same size is set on StatContainer, only the entries it selects are
        considered. Range, minimum and maximum are shown as a table.

        Parameters:
            array_data result object whose ``data`` is a numeric or
                datetime64 array

        Returns:
            A Success ResultObject holding ``max - min``, or an Error
            ResultObject when the dtype is unsupported or no valid entry
            remains.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        array = array_data.data

        if numpy.issubdtype(array.dtype, numpy.number):
            idx = numpy.logical_not(numpy.isnan(array))
        elif numpy.issubdtype(array.dtype, numpy.datetime64):
            idx = numpy.logical_not(numpy.isnat(array))
        else:
            # The message used to say "max", copied from the max command.
            Printer.Print("The array is not supported type so cannot find range")
            return result_object
        conditional = StatContainer.conditional_array
        if conditional is not None and conditional.data.size == array.size:
            idx = numpy.logical_and(idx, conditional.data)

        # numpy.max/min raise on an empty selection, so bail out first.
        if not numpy.any(idx):
            Printer.Print("The array has no valid values so cannot find range")
            return result_object

        valid_values = array[idx]
        max_val = numpy.max(valid_values)
        min_val = numpy.min(valid_values)
        range_val = max_val - min_val
        result_object = ResultObject(range_val, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_data.keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)

        df_new = pd.DataFrame()
        df_new['Feature'] = [array_data.name]
        df_new['Range'] = [range_val]
        df_new['Minimum'] = [min_val]
        df_new['Maximum'] = [max_val]

        TablePrinter.printDataFrame(df_new)

        return result_object
Ejemplo n.º 22
0
    def evaluate(self, array_datas):
        """
        Combine the supplied arrays into a new data frame

        Parameters:
            array_datas arrays handed to
                ``DataGuru.transformArray_to_dataFrame``

        Returns:
            A Success ResultObject of type csv holding the data frame, or
            an Error ResultObject when the conversion fails.
        """
        conversion = DataGuru.transformArray_to_dataFrame(array_datas)
        command_status, df, kl1, cname = conversion
        if command_status == CommandStatus.Error:
            Printer.Print(
                "Please check whether the arrays are of the same size")
            return ResultObject(None, None, None, CommandStatus.Error)

        frame_result = ResultObject(df, [], DataType.csv,
                                    CommandStatus.Success)
        frame_result.createName(cname,
                                command_name='concatenate.array',
                                set_keyword_list=True)

        # Show the combined table to the user.
        TablePrinter.printDataFrame(df)

        return frame_result
Ejemplo n.º 23
0
    def evaluate(self, array_datas):
        """
        Subtract the first array from the second one element-wise

        Parameters:
            array_datas the two arrays; they are combined into a data
                frame and the result is ``column 1 - column 0``

        Returns:
            A Success ResultObject of type array holding the difference,
            or an Error ResultObject when the arrays cannot be combined
            or subtracted.
        """
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            Printer.Print("please try the following command:",
                          "subtract a from b")
            return ResultObject(None, None, None, CommandStatus.Error)
        # ``DataFrame.as_matrix()`` is no longer available in pandas;
        # ``.values`` returns the same array.
        df_array = df.values
        try:
            out = df_array[:, 1] - df_array[:, 0]
        except Exception:
            # e.g. fewer than two columns or non-numeric data
            return ResultObject(None, None, None, CommandStatus.Error)
        result_object = ResultObject(out, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_datas[0].keyword_list,
                                 array_datas[1].keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        return result_object
Ejemplo n.º 24
0
    def read(self, file_path, keyword_list, recursive=False, folder_database=None):
        """
        Scan a folder for csv/xlsx and image files and store it in history

        Parameters:
            file_path folder location; when it is not a directory the same
                path below the user's home directory is tried
            keyword_list keywords used to describe the folder; derived
                from ``file_path`` when empty
            recursive when True, sub-folders are scanned as well
            folder_database database that receives the files; a new one is
                created when None (recursive calls pass the existing one)

        Returns:
            A Success ResultObject of type folder for a top-level call,
            an Error ResultObject when the folder cannot be found, or
            False when called with an existing ``folder_database``.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Only the outermost call owns the database and builds the result.
        create_result = folder_database is None
        if create_result:
            folder_database = TypeDatabase(
                data_type_list=[DataType.csv, DataType.image])

        if len(keyword_list) == 0:
            keyword_list = splitPattern(file_path)

        if not os.path.isdir(file_path):
            # Retry relative to the user's home directory.
            file_path = os.path.join(os.path.expanduser('~'), file_path)
            if not os.path.isdir(file_path):
                print("Cannot find folder: ", file_path)
                return result_object

        for dir_entry in os.scandir(file_path):
            entry_name = dir_entry.name
            if (self.checkEndsWith(entry_name, ['.csv', '.xlsx'])
                    and dir_entry.is_file()):
                self.addFile(dir_entry, DataType.csv, folder_database,
                             file_path)
            elif (self.checkEndsWith(entry_name,
                                     ['.png', '.jpg', '.JPG', '.jpeg'])
                    and dir_entry.is_file()):
                self.addFile(dir_entry, DataType.image, folder_database,
                             file_path)
            if recursive and dir_entry.is_dir():
                sub_keywords = keyword_list + splitPattern(entry_name)
                self.read(os.path.join(file_path, entry_name),
                          sub_keywords, True, folder_database)

        if not create_result:
            return False

        folder_object = FolderObject(folder_database, file_path)
        result_object = ResultObject(folder_object, keyword_list,
                                     DataType.folder, CommandStatus.Success)
        result_object.createName(keyword_list)
        return result_object
Ejemplo n.º 25
0
    def evaluate(self, array_datas, data_frame):
        """
        Run pairwise t-tests between the reference-variable groups for every
        feature column and store the p-values to history.

        The test is scipy.stats.ttest_ind with equal_var=False. The input
        table is only read; the reference variable is not written into it.

        Parameters:
            array_datas: arrays to analyze; used only when data_frame is None
            data_frame: table to analyze; takes precedence over array_datas
        Returns:
            A list of ResultObjects (the p-value table followed by one array
            per column of that table). A single ResultObject with
            CommandStatus.Error is returned when no input is given, no
            reference variable is set, its size does not match the table, or
            it is not categorical.
        """
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            Printer.Print("Could not find the reference variable.")
            Printer.Print("Please set the reference variable")
            return ResultObject(None, None, None, CommandStatus.Error)

        gtVals = StatContainer.filterGroundTruth()
        ground_truth = StatContainer.ground_truth.name
        if len(gtVals) != df.shape[0]:
            Printer.Print(
                "The size of the ground truth does not match with arrays being analyzed"
            )
            Printer.Print(len(gtVals), df.shape[0])
            return ResultObject(None, None, None, CommandStatus.Error)

        uniqVals = StatContainer.isCategorical(gtVals)
        if uniqVals is None:
            # Without discrete groups there is nothing to compare
            Printer.Print("The reference variable is not categorical")
            return ResultObject(None, None, None, CommandStatus.Error)

        # Every column except the reference variable itself is a feature
        feature_columns = df.columns
        if ground_truth in feature_columns:
            feature_columns = feature_columns.drop(ground_truth)
        df_new = pd.DataFrame()
        df_new['features'] = feature_columns.values

        # One row mask per group, computed once instead of once per feature
        group_masks = [gtVals == value for value in uniqVals]
        num_groups = len(uniqVals)
        for first in range(num_groups):
            for second in range(first + 1, num_groups):
                # Each p-value column is filled as a whole array and assigned
                # once; writing single cells through chained indexing is not
                # guaranteed to reach df_new.
                pvalues = np.zeros(len(feature_columns))
                if uniqVals[first] != uniqVals[second]:
                    for row, feature in enumerate(feature_columns):
                        arr = df[feature]
                        ttest_val = scipy.stats.ttest_ind(
                            arr[group_masks[first]],
                            arr[group_masks[second]],
                            axis=0,
                            equal_var=False)
                        pvalues[row] = ttest_val.pvalue
                df_new['pValue: ' + str(first) + ' vs ' +
                       str(second)] = pvalues

        TablePrinter.printDataFrame(df_new)

        result_objects = []
        # Adding the newly created csv
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name='sigtest',
                                 set_keyword_list=True)
        result_objects.append(result_object)

        # create an updated list of column names by removing the common names
        column_names = df_new.columns
        truncated_names, _ = StatContainer.removeCommonNames(column_names)
        for position, column in enumerate(column_names):
            result_object = ResultObject(df_new[column], [], DataType.array,
                                         CommandStatus.Success)
            result_object.createName(truncated_names[position],
                                     command_name='sigtest',
                                     set_keyword_list=True)
            result_objects.append(result_object)
        return result_objects
Ejemplo n.º 26
0
    def preRead(self, file_path, keyword_list):
        """
        Load a table with pandas and expose it together with its columns.

        The file is read with pd.read_csv and, if that fails, with
        pd.read_excel. Columns whose header matches self.col_head_pattern are
        dropped from the table. String columns whose first value contains '%'
        are converted to float after stripping trailing '%'; string columns
        whose first value contains '$' or ',' are translated with
        self.currency_dict and converted to float. Columns that fail the
        conversion are kept unchanged. Finally the table columns are renamed
        to the generated result names.

        Parameters:
            file_path: location of the file to load
            keyword_list: keywords used to describe the table
        Returns:
            A list of ResultObjects: the table first, then one array per
            non-empty kept column (only the table when it has more than 5000
            columns). When the file cannot be read by either reader a single
            ResultObject with CommandStatus.Error is returned instead.
        """
        command_status = CommandStatus.Success
        try:
            data = pd.read_csv(file_path)
        except Exception:
            # Not readable as csv; fall back to the excel reader
            try:
                data = pd.read_excel(file_path)
            except Exception:
                return ResultObject("File not found", None, None,
                                    CommandStatus.Error)
        result_objects = []
        result_object = ResultObject(data,
                                     keyword_list,
                                     DataType.csv,
                                     command_status,
                                     add_to_cache=True)
        result_object.createName(result_object.keyword_list)
        result_objects.append(result_object)
        # Too many columns do not extract them individually
        if len(data.columns) > 5000:
            return result_objects

        new_column_names = []
        for column in data.columns:
            if self.col_head_pattern.match(column):
                data.drop(column, axis=1, inplace=True)
                continue
            col_keyword_list = splitPattern(column)
            col_data = data[column].values

            if col_data.size == 0:
                # No array is created for an empty column, but its name must
                # still be recorded: the rename below needs exactly one name
                # per remaining column.
                new_column_names.append(column)
                continue

            first_value = col_data[0]
            converted = None
            if isinstance(first_value, str):
                if '%' in first_value:
                    try:
                        converted = data[column].str.rstrip('%').astype(
                            float, copy=False)
                    except ValueError:
                        # Not every entry is a percentage; keep the strings
                        pass
                    else:
                        if 'percent' not in col_keyword_list:
                            col_keyword_list.append('percent')
                elif '$' in first_value or ',' in first_value:
                    try:
                        converted = data[column].str.translate(
                            self.currency_dict).astype(float, copy=False)
                    except ValueError:
                        # Not every entry is an amount; keep the strings
                        pass
                    else:
                        if '$' not in col_keyword_list:
                            col_keyword_list.append('$')
            if converted is not None:
                data[column] = converted
                # Hand on a plain array, like for the unconverted columns
                col_data = converted.values

            result_object = ResultObject(col_data,
                                         col_keyword_list,
                                         DataType.array,
                                         command_status,
                                         add_to_cache=True)
            result_object.createName(col_keyword_list)
            new_column_names.append(result_object.name)
            result_objects.append(result_object)
            # NOTE: the search for a column with few unique values (to use as
            # reference variable) was removed here because it is slow.

        # Replace columns:
        data.columns = new_column_names
        return result_objects
Ejemplo n.º 27
0
    def evaluate(self, array_datas, data_frame):
        """
        Calculate an AUC score for every feature column and store it to history.

        For each feature a LogisticRegression is fit on that single column
        against the reference variable. One ROC curve per class is computed
        from the predicted probabilities (that class as positive label) and
        the mean of the per-class AUC values is reported. The input table is
        only read; the reference variable is not written into it.

        Parameters:
            array_datas: arrays to analyze; used only when data_frame is None
            data_frame: table to analyze; takes precedence over array_datas
        Returns:
            A list of ResultObjects (the AUC table followed by one array per
            column of that table). A single ResultObject with
            CommandStatus.Error is returned when no input is given, no
            reference variable is set, its size does not match the table, or
            it is not categorical.
        """
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            Printer.Print("Could not find the reference variable.")
            Printer.Print("Please set the reference variable")
            return ResultObject(None, None, None, CommandStatus.Error)

        gtVals = StatContainer.filterGroundTruth()
        ground_truth = StatContainer.ground_truth.name
        if len(gtVals) != df.shape[0]:
            Printer.Print(
                "The size of the ground truth does not match with arrays being analyzed"
            )
            Printer.Print(len(gtVals), df.shape[0])
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.isCategorical(gtVals) is None:
            # ROC curves need discrete classes
            Printer.Print("The reference variable is not categorical")
            return ResultObject(None, None, None, CommandStatus.Error)

        # Every column except the reference variable itself is a feature
        feature_columns = df.columns
        if ground_truth in feature_columns:
            feature_columns = feature_columns.drop(ground_truth)
        df_new = pd.DataFrame()
        df_new['features'] = feature_columns.values

        avgAUC = []
        for feature in feature_columns:
            X1 = df[feature].values.reshape(-1, 1)
            model = LogisticRegression()
            model.fit(X1, gtVals)
            # evaluate the model
            Y_Pr = model.predict_proba(X1)
            allAUC = []
            # Column i of predict_proba belongs to model.classes_[i], so the
            # positive label is taken from there to keep the two aligned.
            for class_position, class_label in enumerate(model.classes_):
                fpr, tpr, _ = metrics.roc_curve(gtVals,
                                                Y_Pr[:, class_position],
                                                pos_label=class_label)
                allAUC.append(metrics.auc(fpr, tpr))
            avgAUC.append(np.mean(allAUC))
        df_new['AUC'] = avgAUC

        TablePrinter.printDataFrame(df_new)

        # New data frame
        result_objects = []
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name='rcurve',
                                 set_keyword_list=True)
        result_objects.append(result_object)

        # create an updated list of column names by removing the common names
        column_names = df_new.columns
        truncated_names, _ = StatContainer.removeCommonNames(column_names)
        for position, column in enumerate(column_names):
            result_object = ResultObject(df_new[column], [], DataType.array,
                                         CommandStatus.Success)
            result_object.createName(truncated_names[position],
                                     command_name='rcurve',
                                     set_keyword_list=True)
            result_objects.append(result_object)

        return result_objects
Ejemplo n.º 28
0
    def evaluate(self, data_frame, array_datas, target):
        """
        Run clustering on a table or on a dataset of multiple arrays.

        Rows with missing values are left out, the remaining values are
        standardized with StandardScaler and passed to self.performOperation
        together with the requested number of clusters. When a reference
        variable is set, a heatmap of the reference-value / cluster counts is
        shown and returned as well.

        Parameters:
            data_frame: table to cluster; takes precedence over array_datas
            array_datas: arrays to cluster; used only when data_frame is None
            target: user input; the first positive number found in
                target.data is the number of clusters (default 2)
        Returns:
            A list of ResultObjects: the figure result (only when a reference
            variable is set) followed by the array of cluster assignments. A
            single ResultObject with CommandStatus.Error is returned when no
            input is given or the arrays cannot be combined.
        """
        # Get the data frame
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

            # Remove nans:
            df, Y = DataGuru.removenan(df, Y)
        else:
            # Rebind instead of dropping in place, so the caller's table
            # (data_frame.data) keeps its rows.
            df = df.dropna()

        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Number of clusters requested by the user
        numbers = findNumbers(target.data, 1)
        if numbers != [] and numbers[0].data > 0:
            num_clusters = int(numbers[0].data)
        else:
            num_clusters = 2  # If not specified use 2 clusters

        kY = self.performOperation(X, num_clusters)
        result_objects = []
        if StatContainer.ground_truth is not None:
            df_res = pd.DataFrame()
            df_res['ground_truth'] = Y
            df_res['clustering_result'] = kY
            # Count how often each reference value lands in each cluster.
            # The counts are what gets plotted, so they must be kept.
            contingency = pd.crosstab(df_res['ground_truth'],
                                      df_res['clustering_result'])
            win = Window.window()
            f = win.gcf()
            ax = f.add_subplot(111)

            sns.heatmap(contingency, ax=ax)
            win.show()
            if data_frame is not None:
                result_object = VizContainer.createResult(
                    win, data_frame, ['clstr.fig'])
            else:
                result_object = VizContainer.createResult(
                    win, array_datas, ['clstr.fig'])
            result_objects.append(result_object)

        result_object = ResultObject(kY, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name="clstr",
                                 set_keyword_list=True)

        result_objects.append(result_object)
        return result_objects
Ejemplo n.º 29
0
 def evaluate(self, array_data, target):
     """
     Select the top or bottom entries of an array, or print its first ones.

     The array is first restricted by StatContainer.conditional_array (when
     that has the same length as the array) and NaN entries are left out.
     The behaviour then depends on self._condition[0]:
         "top" / "bottom": numeric arrays select the largest / smallest
             values; other arrays select the values of the most / least
             frequent categories. Up to 30 selected entries are printed.
         "first": prints the leading entries and selects nothing.

     Parameters:
         array_data: array to look at
         target: user input; the first positive number found in target.data
             is the number of entries (or categories) to select
     Returns:
         For "top"/"bottom" a ResultObject holding a boolean mask over the
         original array; for "first" a ResultObject with
         CommandStatus.Success and no data; otherwise a ResultObject with
         CommandStatus.Error.
     """
     result = ResultObject(None, None, None, CommandStatus.Error)
     in_array = array_data.data
     N = in_array.shape[0]
     conditional = StatContainer.conditional_array
     use_conditional = (conditional is not None
                        and len(conditional.data) == N)
     if use_conditional:
         in_array = in_array[conditional.data]
     if in_array.size == 0:
         Printer.Print("No data")
         return result
     nan_idx = StatContainer.getNanIdx(in_array)
     non_nan_idx = np.logical_not(nan_idx)
     non_nan_array = in_array[non_nan_idx]
     numbers = findNumbers(target.data, 1)
     try:
         unique_arr, inv, counts = np.unique(non_nan_array,
                                             return_inverse=True,
                                             return_counts=True)
     except Exception:
         # Best effort: the values could not be sorted into unique entries
         return result
     condition = self._condition[0]
     if numbers != [] and numbers[0].data > 0:
         num = int(numbers[0].data)
         idx = None
         is_numeric = np.issubdtype(non_nan_array.dtype, np.number)
         if not is_numeric:
             num = min(unique_arr.size, num)
         if condition in ("top", "bottom"):
             largest = condition == "top"
             # Numeric arrays are ranked by value, others by category count
             ranked = non_nan_array if is_numeric else counts
             if ranked.size == 0:
                 # Nothing left after removing NaN entries
                 Printer.Print("No data")
                 return result
             # argpartition needs the partition point inside the array
             num = min(ranked.size, num)
             Printer.Print("Finding " + condition, num)
             if largest:
                 picked = np.argpartition(ranked, -num)[-num:]
             else:
                 # kth = num - 1 places the num smallest entries first
                 picked = np.argpartition(ranked, num - 1)[:num]
             heading = "Top values:" if largest else "Worst values:"
             if is_numeric:
                 idx = np.full(non_nan_array.size, False)
                 idx[picked] = True
                 if num <= 30:
                     row_labels = StatContainer.row_labels
                     if row_labels is not None:
                         labels = np.asarray(row_labels.data)
                         if len(labels) == N:
                             # picked holds positions in the filtered array,
                             # so full-length labels get the same filtering
                             if use_conditional:
                                 labels = labels[conditional.data]
                             labels = labels[non_nan_idx]
                         df_new = pd.DataFrame(
                             {array_data.name: non_nan_array[picked]})
                         df_new[row_labels.name] = labels[picked]
                         TablePrinter.printDataFrame(df_new)
                         TablePrinter.sort(0, ascending=not largest)
                     else:
                         Printer.Print(heading)
                         Printer.Print(non_nan_array[picked])
             else:
                 # Select every entry that belongs to a picked category
                 idx = np.isin(inv, picked)
                 if num <= 30:
                     Printer.Print(heading)
                     Printer.Print(unique_arr[picked])
         elif condition == "first":
             Printer.Print(array_data.data[:num])
             result = ResultObject(None, None, None, CommandStatus.Success)
         else:
             Printer.Print("Did not find the right condition")
         if idx is not None:
             # Expand the selection back to the length of the input array
             out1 = np.full(in_array.size, False)
             out1[non_nan_idx] = idx
             if use_conditional:
                 out = np.full(N, False)
                 out[conditional.data] = out1
             else:
                 out = out1
             result = ResultObject(out, [], DataType.logical_array,
                                   CommandStatus.Success, True)
             result.createName(array_data.keyword_list,
                               command_name=condition,
                               set_keyword_list=True)
     elif condition == "first":
         if unique_arr.size < 50:
             Printer.Print(unique_arr)
         else:
             Printer.Print(non_nan_array[:10])
         result = ResultObject(None, None, None, CommandStatus.Success)
     return result
Ejemplo n.º 30
0
    def evaluate(self, array_datas):
        """
        Calculate a label-wise statistic of the arrays and store it to history.

        The arrays are combined into one table, the reference variable is
        added as a column and rows with missing values are dropped. The
        table is then passed to self.performOperation together with the name
        of the reference column. A numeric reference variable that is not
        categorical is cut into 10 bins first. Without a reference variable
        all rows share a single label.

        Parameters:
            array_datas: arrays to summarize
        Returns:
            A list with one array ResultObject per column of the summary
            table. When the reference variable is neither categorical nor
            numeric the list holds a single ResultObject with
            CommandStatus.Error. A bare error ResultObject is returned when
            array_datas is an empty list or cannot be combined.
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        if isinstance(array_datas, list) and len(array_datas) == 0:
            return result_object
        command_status, df, _, _ = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            gtVals = np.ones(df.shape[0])
            gtName = 'ground_truth'
        else:
            gtVals = StatContainer.filterGroundTruth()
            gtName = StatContainer.ground_truth.name

        # Remove nans:
        df[gtName] = gtVals
        df.dropna(inplace=True)

        gtVals = df[gtName]
        uniqVals = StatContainer.isCategorical(gtVals, uniqueCutoff=1000)

        binned_ground_truth = False
        if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
            # Convert to categorical
            df[gtName] = pd.cut(gtVals, 10)
            binned_ground_truth = True

        if uniqVals is None and not binned_ground_truth:
            # Neither categorical nor binnable: no groups to work with
            Printer.Print("The array is not of numeric type so cannot",
                          "calculate groupwise " + self._condition[0])
            return [result_object]

        # Create groupwise arrays
        result_objects = []
        df_new = self.performOperation(df, gtName)
        df_new = df_new.reset_index()
        command_name = 'labelwise.' + self._condition[0]
        for col in df_new.columns:
            if col == '':
                kName = array_datas[0].keyword_list
            else:
                kName = [col]

            result_object = ResultObject(df_new[col], [], DataType.array,
                                         CommandStatus.Success)
            result_object.createName(kName,
                                     command_name=command_name,
                                     set_keyword_list=True)
            result_objects.append(result_object)
        TablePrinter.printDataFrame(df_new)

        return result_objects