Example 1
    def updateWindow(self, win, cm, cvscores, Y, title):
        f = win.gcf()
        f.clear()
        ax = f.add_subplot(111)
        DataGuru.plot_confusion_matrix(cm, np.unique(Y), ax, title="confusion matrix")
        ax.set_title(title)
        win.show()
Example 2
    def evaluate(self, array_datas):
        """
        Create a scatter plot between multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
        if command_status == CommandStatus.Error:
            return result_object
        if len(df.columns) <= 1:
            Printer.Print("There needs to be atleast two variables to perform multiscatter plot!")
            return result_object

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', ax=ax)
        else:
            gt1 = pd.Series(StatContainer.filterGroundTruth())
            df, gt1 = DataGuru.removenan(df, gt1)
            lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size)))
            row_colors = gt1.map(lut)
            pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', c=row_colors, cmap="jet", ax=ax)

        f.suptitle(cname)

        win.show()

        return VizContainer.createResult(win, array_datas, ['multiscatter'])
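For reference, a minimal self-contained sketch of the pandas call used above (pd.plotting.scatter_matrix) on synthetic data; the project wrappers (Window, DataGuru, StatContainer) are omitted and the random DataFrame is purely illustrative.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Synthetic stand-in for the transformed array_datas
df = pd.DataFrame(np.random.randn(100, 3), columns=['a', 'b', 'c'])

# Kernel density estimates on the diagonal, pairwise scatter plots elsewhere
pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde')
plt.suptitle('multiscatter example')
plt.show()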
Example 3
    def evaluate(self, data_frame, classifier_algo):
        """
        Train a classifier on multiple arrays

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        df = data_frame.data
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)

        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        # Get the classifier model
        model = classifier_algo.data[0]

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier
        Printer.Print("Training the classifier")
        df_show = pd.DataFrame()
        df_show['Features'] = df.columns

        TablePrinter.printDataFrame(df_show)
        model.fit(X, Y)

        # Print an update
        Printer.Print("The classifier", classifier_algo.name,
                      "has been trained")

        predictions = model.predict(X)
        accuracy = metrics.accuracy_score(predictions, Y)
        Printer.Print("Accuracy on training set : %s" % "{0:.3%}".format(accuracy))

        trained_model = {'Scaler': scaler, 'Model': model}

        result_object = ResultObject(trained_model, [], DataType.trained_model,
                              CommandStatus.Success)

        classifier_algo_name = classifier_algo.name.replace('.', ' ')
        result_object.createName(data_frame.keyword_list, command_name=classifier_algo_name,
                          set_keyword_list=True)

        return result_object
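The training above depends on the project's data wrappers; a minimal standalone sketch of the same scikit-learn steps (StandardScaler, fit, predict, accuracy_score), with synthetic data and LogisticRegression standing in for classifier_algo.data[0], might look like this:

import numpy as np
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression

# Synthetic feature matrix and labels, stand-ins for df.values and the ground truth
rng = np.random.RandomState(0)
X = rng.randn(200, 4)
Y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Same scaling step as above
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

model = LogisticRegression()  # stand-in for the passed-in classifier
model.fit(X_scaled, Y)
predictions = model.predict(X_scaled)
print("Accuracy on training set: {0:.3%}".format(metrics.accuracy_score(Y, predictions)))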
Example 4
    def evaluate(self, data_frame, classifier_model):
        """
        Run a trained classifier on multiple arrays

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        sns.set(color_codes=True)
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)
        df = data_frame.data
        # if command_status == CommandStatus.Error:
        #    return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth to",
                          "to get the prediction accuracy")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.ground_truth.data

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)
        X = df.values

        # Get the classifier model
        trained_model = classifier_model.data
        model = trained_model['Model']
        scaler = trained_model['Scaler']

        # Scale the values based on the training standardizer
        X = scaler.transform(X)

        # Code to run the classifier
        # Plot the classification result
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        Printer.Print('Running the trained classifier...')

        predictions = model.predict(X)
        accuracy = metrics.accuracy_score(predictions, Y)
        Printer.Print("Accuracy : %s" % "{0:.3%}".format(accuracy))
        cm = metrics.confusion_matrix(Y, predictions)
        DataGuru.plot_confusion_matrix(cm,
                                       np.unique(Y),
                                       ax,
                                       title="confusion matrix")
        win.show()

        # TODO Need to save the result
        result_object = ResultObject(None, None, None, CommandStatus.Success)

        return result_object
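DataGuru.plot_confusion_matrix is project code; a rough self-contained equivalent built only on sklearn.metrics.confusion_matrix and matplotlib (the labels and colormap here are assumptions) could look like this:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

# Synthetic ground truth and predictions
y_true = np.array([0, 0, 1, 1, 2, 2, 2, 0, 1, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0, 2, 0, 1, 2])

cm = metrics.confusion_matrix(y_true, y_pred)
labels = np.unique(y_true)

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('confusion matrix')
fig.colorbar(im)
plt.show()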
Example 5
    def evaluate(self, data_frame, array_datas):
        """
        Run Isomap on a dataset of multiple arrays

        """

        # Get the data frame
        if data_frame is not None:
            df = data_frame.data
            df = DataGuru.convertStrCols_toNumeric(df)
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)
        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()
            df, Y = DataGuru.removenan(df, Y)
        # Remove nans:
        else:
            df.dropna(inplace=True)

        # Get the Isomap model

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier

        win = Window.window()

        properties = self.createDefaultProperties()
        properties['title'] = cname

        # return ResultObject(None, None, None, CommandStatus.Success)
        if data_frame is not None:
            result_object = VizContainer.createResult(win, data_frame,
                                                      ['ismp'])
        else:
            result_object = VizContainer.createResult(win, array_datas,
                                                      ['ismp'])

        result_object.data = [win, properties, [X, Y], self.updateFigure]
        self.updateFigure(result_object.data)
        self.modify_figure.evaluate(result_object)
        return result_object
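The snippet above prepares and scales the data but delegates the actual embedding to updateFigure; as an illustration only, a standalone Isomap projection with scikit-learn (synthetic data, arbitrary n_neighbors) might be:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.manifold import Isomap

# Synthetic stand-in for df.values
X = np.random.rand(200, 5)
X = preprocessing.StandardScaler().fit_transform(X)

# Two-dimensional Isomap embedding
embedding = Isomap(n_components=2, n_neighbors=5).fit_transform(X)
plt.scatter(embedding[:, 0], embedding[:, 1], alpha=0.5)
plt.title('Isomap embedding')
plt.show()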
Example 6
    def evaluate(self, data_frame, target):
        """
        Use one of the models to identify the top predictors
        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        df = data_frame.data
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)

        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        numbers = findNumbers(target.data, 1)
        if numbers != [] and numbers[0].data > 0:
            num = int(numbers[0].data)
        else:
            num = 10  # If not specified select top 10 features

        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        model = RandomForestClassifier(n_estimators=100)
        model.fit(X, Y)
        featImpVals = model.feature_importances_

        featimp = pd.Series(featImpVals,
                            index=df.columns).sort_values(ascending=False)

        df_show = pd.DataFrame()
        df_show['top features'] = featimp.index[0:num]
        df_show['feature importance'] = featimp.values[0:num]
        TablePrinter.printDataFrame(df_show)
        df_new = df[featimp.index[0:num]]

        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)

        command_name = 'top.predictors'
        result_object.createName(data_frame.name,
                                 command_name=command_name,
                                 set_keyword_list=True)

        return result_object
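A compact self-contained sketch of the feature-ranking idea above (RandomForestClassifier importances on scaled data), using a synthetic DataFrame whose labels are driven mostly by two of the columns:

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(300, 6), columns=['f0', 'f1', 'f2', 'f3', 'f4', 'f5'])
Y = (df['f0'] + 2 * df['f3'] > 0).astype(int)  # labels depend mainly on f0 and f3

X = preprocessing.StandardScaler().fit_transform(df.values)
model = RandomForestClassifier(n_estimators=100)
model.fit(X, Y)

featimp = pd.Series(model.feature_importances_,
                    index=df.columns).sort_values(ascending=False)
num = 3  # number of top predictors to keep
print(featimp.index[:num].tolist())   # names of the top features
print(featimp.values[:num])           # their importance scores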
Example 7
    def performCV(self, properties, X, Y, model):
        try:
            kValue = int(properties["k (default: LOOCV)"])
        except Exception:
            kValue = 0
        if kValue > X.shape[0]:
            kValue = 0

        if kValue == 0:
            cm, cvscores = DataGuru.runLOOCV(X, Y, model)
        else:
            cm, cvscores = DataGuru.runKFoldCV(X, Y, model, kValue)
        return kValue, cm, cvscores
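DataGuru.runLOOCV and runKFoldCV are project helpers; the same k-fold-versus-LOOCV switch can be sketched with scikit-learn's cross-validation utilities (synthetic data, LogisticRegression as a stand-in model):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

rng = np.random.RandomState(0)
X = rng.randn(60, 3)
Y = (X[:, 0] > 0).astype(int)
model = LogisticRegression()

kValue = 0  # 0 means leave-one-out, mirroring the convention above
cv = LeaveOneOut() if kValue == 0 else KFold(n_splits=kValue)
cvscores = cross_val_score(model, X, Y, cv=cv)
print("mean CV accuracy: %.3f" % cvscores.mean())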
Example 8
    def evaluate(self, array_datas):
        """
        Display a heatmap for data visualization

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        Printer.Print("Displaying heatmap")
        win = Window.window()
        f = win.gcf()
        if StatContainer.ground_truth is None:
            sns.clustermap(df,
                           cbar=True,
                           square=False,
                           annot=False,
                           cmap='jet',
                           standard_scale=1)
        else:
            gt1 = pd.Series(StatContainer.ground_truth.data)
            lut = dict(zip(gt1.unique(), "rbg"))
            row_colors = gt1.map(lut)
            sns.clustermap(df,
                           standard_scale=1,
                           row_colors=row_colors,
                           cmap="jet")

        win.show()
        return VizContainer.createResult(win, array_datas, ['heatmap'])
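A minimal standalone version of the ground-truth-colored clustermap above, using only pandas and seaborn on random data (the three-class labels and the "rbg" color letters mirror the snippet):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
df = pd.DataFrame(np.random.rand(20, 5), columns=list('abcde'))
gt1 = pd.Series(np.random.choice(['x', 'y', 'z'], size=20))

# One color letter per class, as above
lut = dict(zip(gt1.unique(), "rbg"))
row_colors = gt1.map(lut)
sns.clustermap(df, standard_scale=1, row_colors=row_colors, cmap='jet')
plt.show()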
Example 9
    def evaluate(self, array_datas):
        """
        Create a violin plot for multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            sns.violinplot(data=df, ax=ax)
        else:
            ground_truth = " ".join(StatContainer.ground_truth.keyword_list)
            df[ground_truth] = StatContainer.filterGroundTruth()
            df.dropna(inplace=True)
            df1 = pd.melt(df, id_vars=ground_truth)
            sns.violinplot(data=df1, ax=ax, x='variable', y='value', hue=ground_truth)

        win.show()

        return VizContainer.createResult(win, array_datas, ['violin'])
Example 10
    def evaluate(self, array_datas):
        """
        Create a box plot between multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None or len(
                StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            df.boxplot(ax=ax)
        else:
            ground_truth = StatContainer.ground_truth.name
            df[ground_truth] = StatContainer.filterGroundTruth()
            df.dropna(inplace=True)
            df.boxplot(by=ground_truth, ax=ax)
            f.suptitle("")
        win.show()

        return VizContainer.createResult(win, array_datas, ['box'])
Example 11
    def evaluate(self, array_datas):
        """
        Create a line plot 

        """
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True, expand_single=True,
                remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        elif (df.shape[0] == 0 or
              (df.shape[1] == 1 and
               not np.issubdtype(array_datas[0].data.dtype, np.number))):
            Printer.Print("No data left to plot after cleaning up!")
            return ResultObject(None, None, None, CommandStatus.Error)

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        ax.set_title(cname)
        df.plot(ax=ax)

        win.show()

        return VizContainer.createResult(win, array_datas, ['line'])
Example 12
    def read(self, file_path, keyword_list):
        try:
            property_data, model_name = self.createProperties(file_path)
            model = DataGuru.createModel(property_data, model_name)
        except Exception:
            Printer.Print("File not found")
            return ResultObject(None, None, None, CommandStatus.Error)

        command_status = CommandStatus.Success
        result_data = [model, property_data, model_name, self.updateModel]
        result_object = ResultObject(result_data,
                                     keyword_list,
                                     DataType.algorithm_arg,
                                     command_status,
                                     add_to_cache=True)
        result_object.createName(keyword_list)

        if (PropertyEditor.parent_widget is None
                or PropertyEditor.property_editor_class is None):
            Printer.Print("Cannot modify algorithm properties in non-GUI mode")
        else:
            property_editor = PropertyEditor.property_editor_class(
                result_object)
            PropertyEditor.addPropertyEditor(property_editor)

        return result_object
Example 13
    def preEvaluate(self, data_frame, array_datas, classifier_algo):
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        # Get the data frame
        sns.set(color_codes=True)
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                print("Error in getting dataframe!")
                result_object.data = "Error in getting dataframe!"
                return result_object
        else:
            result_object.data = "Please provide data frame or arrays to analyze"
            return result_object

        # Get the ground truth array
        if StatContainer.ground_truth is None:
            result_object.data = ("Please set a feature vector to ground truth by" +
                                  "typing set ground truth before using this command")
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.ground_truth.data
        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        # Get the classifier model
        model = classifier_algo.data[0]

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        properties = self.createDefaultProperties()
        properties['title'] = cname
        cv_output = self.performCV(properties, X, Y, model)
        aux_output = (properties, [X, Y, model])

        return [ResultObject(cv_output, None),
                ResultObject(aux_output, None)]
Example 14
    def evaluate(self, array_datas):
        """
        Create a histogram for multiple variables

        """

        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas, useCategorical=True, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        dCol = df[df.columns[0]]
        try:
            uniqVals, inv, counts = np.unique(dCol,
                                              return_inverse=True,
                                              return_counts=True)
        except Exception:
            return ResultObject(None, None, None, CommandStatus.Error)
        if len(uniqVals) > self.max_unique:
            if isinstance(uniqVals[0], str):
                best_idx = np.argpartition(counts,
                                           -self.max_unique)[-self.max_unique:]
                idx = np.isin(inv, best_idx)
                dCol = dCol[idx]
            else:
                uniqVals = None
        if uniqVals is not None and isinstance(uniqVals[0], str):
            max_len = max([len(uniqVal) for uniqVal in uniqVals])
        else:
            max_len = 0

        if (uniqVals is None and not np.issubdtype(dCol.dtype, np.number)):
            Printer.Print("Too many unique values in non-numeric type data")
            return ResultObject(None, None, None, CommandStatus.Error)

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        # TODO Create an argument for setting number of bins
        if uniqVals is not None:
            if len(uniqVals) > 5 and max_len > 8:
                df = dCol.to_frame(name=kl1[0])
                sns.countplot(y=kl1[0], data=df, ax=ax)
            else:
                df = dCol.to_frame(name=kl1[0])
                sns.countplot(x=kl1[0], data=df, ax=ax)
        elif np.issubdtype(dCol.dtype, np.number):
            df.plot.hist(stacked=True, ax=ax)

        win.show()

        return VizContainer.createResult(win, array_datas,
                                         ['histogram', 'hist'])
Example 15
    def evaluate(self, array_datas):
        """
        Create a scatter plot between two variables

        """
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            Printer.Print("please try the following command:",
                          "Visualize comparison between...")
            return ResultObject(None, None, None, CommandStatus.Error)

        properties = self.createDefaultProperties()
        properties['title'] = cname

        win = Window.window()
        row_colors = None
        if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            if df.shape[0] == 0:
                return ResultObject(None, None, None, CommandStatus.Error)
            array = df.values
        else:
            gt1 = pd.Series(StatContainer.filterGroundTruth())
            df, gt1 = DataGuru.removenan(df, gt1)
            if df.shape[0] == 0:
                return ResultObject(None, None, None, CommandStatus.Error)
            lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size)))
            row_colors = gt1.map(lut)
            array = df.values

        result_object = VizContainer.createResult(
            win, array_datas, ['scatter2d'])
        result_object.data = [win, properties, [
            array, row_colors, kl1], self.updateFigure]
        self.updateFigure(result_object.data)
        self.modify_figure.evaluate(result_object)
        return result_object
Example 16
    def evaluate(self, array_datas, data_frame):
        """
        Calculate a label-wise mean array and store it to history
        Parameters:

        """
        result_object = ResultObject(None, None, None, CommandStatus.Success)

        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if len(cname) == 0:
                cname = ".".join(kl1)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        df_new = self.performOperation(df)
        TablePrinter.printDataFrame(df_new)

        result_objects = []
        # Adding the newly created CSV
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        command_name = "smry"
        result_object.createName(cname,
                                 command_name=command_name,
                                 set_keyword_list=True)

        result_objects.append(result_object)
        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col in range(0, len(kl1)):
            arr = df_new[kl1[col]]
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)

            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)

        return result_objects
Example 17
    def evaluate(self, array_datas):
        """
        Find the correlation between two or more variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        if len(array_datas) < 2:
            Printer.Print("Need atleast two arrays to compute correlation")
            return ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        corr_res = df.corr()

        if len(array_datas) == 2:
            Printer.Print("The correlation between ", kl1[0], " and ", kl1[1],
                          " is ", str(corr_res.values[0][1]))

        Printer.Print("Displaying the result as a heatmap")
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        sns.heatmap(corr_res,
                    cbar=True,
                    square=True,
                    annot=True,
                    fmt='.2f',
                    annot_kws={'size': 15},
                    xticklabels=df.columns,
                    yticklabels=df.columns,
                    cmap='jet',
                    ax=ax)
        win.show()
        return VizContainer.createResult(win, array_datas, ['correlation'])
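The correlation heatmap reduces to two library calls; a self-contained sketch on synthetic data (column names and figure handling are simplified here):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.randn(100, 3), columns=['a', 'b', 'c'])
corr_res = df.corr()  # pairwise Pearson correlations

fig, ax = plt.subplots()
sns.heatmap(corr_res, cbar=True, square=True, annot=True, fmt='.2f',
            cmap='jet', ax=ax)
plt.show()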
Example 18
    def evaluate(self, array_datas):
        """
        Subtract one array from another

        """
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            Printer.Print("please try the following command:",
                          "subtract a from b")
            return ResultObject(None, None, None, CommandStatus.Error)
        df_array = df.values  # .as_matrix() was removed in newer pandas
        try:
            out = df_array[:, 1] - df_array[:, 0]
        except Exception:
            return ResultObject(None, None, None, CommandStatus.Error)
        result_object = ResultObject(out, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(array_datas[0].keyword_list,
                                 array_datas[1].keyword_list,
                                 command_name=self.commandTags()[0],
                                 set_keyword_list=True)
        return result_object
Example 19
    def evaluate(self, array_datas):
        """
        Create a new dataframe using the supplied arrays

        """
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            Printer.Print(
                "Please check whether the arrays are of the same size")
            return ResultObject(None, None, None, CommandStatus.Error)

        result_object = ResultObject(df, [], DataType.csv,
                                     CommandStatus.Success)

        command_name = 'concatenate.array'
        result_object.createName(cname,
                                 command_name=command_name,
                                 set_keyword_list=True)

        TablePrinter.printDataFrame(df)

        return result_object
Example 20
    def updateModel(self, result_data):
        properties = result_data[1]
        model_name = result_data[2]
        print("Properties: ", properties)
        result_data[0] = DataGuru.createModel(properties, model_name)
Example 21
    def evaluate(self, array_datas):
        """
        Create a bar plot between multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            gtVals = np.ones(df.shape[0])
            ground_truth = 'ground_truth'
        else:
            gtVals = StatContainer.filterGroundTruth()
            ground_truth = StatContainer.ground_truth.name
            if len(gtVals) != df.shape[0]:
                print("ground truth does not match with df shape")
                print(len(gtVals), df.shape[0])
                gtVals = np.ones(df.shape[0])
                ground_truth = 'ground_truth'

        # Remove nans:
        df[ground_truth] = gtVals
        df.dropna(inplace=True)
        gtVals = df[ground_truth]
        uniqVals = StatContainer.isCategorical(gtVals)
        binned_ground_truth = False
        if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
            # Convert to categorical
            df[ground_truth] = pd.cut(gtVals, 10)
            binned_ground_truth = True

        if binned_ground_truth is True or uniqVals is not None:
            gb = df.groupby(ground_truth)
            df_mean = gb.mean()
            df_errors = gb.std()
            if uniqVals is not None and isinstance(uniqVals[0], str):
                truncated_uniqVals, _ = StatContainer.removeCommonNames(
                    df_mean.index)
                df_mean.index = truncated_uniqVals
                df_errors.index = truncated_uniqVals
            # Number of uniq_vals x number of arrs
            df_mean_shape = df_mean.shape
            if (not binned_ground_truth
                    and df_mean_shape[1] >= df_mean_shape[0]):
                df_mean = df_mean.T
                df_errors = df_errors.T
        else:
            Printer.Print("Ground truth could not be mapped to",
                          "categorical array\n")
            Printer.Print("Please clear or select appropriate ground truth")
            return result_object

        properties = self.createDefaultProperties()
        properties['title'] = cname
        if uniqVals is not None and isinstance(uniqVals[0], str):
            max_len = max([len(uniqVal) for uniqVal in uniqVals])
        else:
            max_len = 0
        if (binned_ground_truth or
            (uniqVals is not None and len(uniqVals) > 5 and max_len > 8)):
            properties["horizontal"] = True
        if binned_ground_truth:
            properties["overwrite_labels"] = True
            properties["ylabel"] = StatContainer.ground_truth.name
        win = Window.window()
        result_object = VizContainer.createResult(win, array_datas, ['bar'])
        result_object.data = [
            win, properties, [df_mean, df_errors], self.updateFigure
        ]
        self.updateFigure(result_object.data)
        self.modify_figure.evaluate(result_object)
        return result_object
Example 22
    def evaluate(self, array_datas, data_frame):
        """
        Calculate a t-test of the array and store it to history
        Parameters:

        """

        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            print("Could not find the reference variable.")
            print("Please set the reference variable")
            return ResultObject(None, None, None, CommandStatus.Error)
        else:
            gtVals = StatContainer.filterGroundTruth()
            ground_truth = StatContainer.ground_truth.name
            if len(gtVals) != df.shape[0]:
                print(
                    "The size of the ground truth does not match with arrays being analyzed"
                )
                print(len(gtVals), df.shape[0])
                return ResultObject(None, None, None, CommandStatus.Error)

        uniqVals = StatContainer.isCategorical(gtVals)
        df[ground_truth] = gtVals
        df_new = pd.DataFrame()
        if ground_truth in df.columns:
            df_new['features'] = df.columns.drop(ground_truth).values
        else:
            df_new['features'] = df.columns

        allCols = df_new['features']
        for iter in range(len(uniqVals)):
            for iter1 in range(iter + 1, len(uniqVals)):
                df_new['pValue: ' + str(iter) + ' vs ' +
                       str(iter1)] = np.zeros(df_new.shape[0])

        for iter_feature in range(len(df_new['features'])):
            arr = df[allCols[iter_feature]]
            for iter in range(len(uniqVals)):
                uniV = uniqVals[iter]
                a = arr[gtVals == uniV]
                for iter1 in range(iter + 1, len(uniqVals)):
                    b = arr[gtVals == uniqVals[iter1]]
                    if uniV != uniqVals[iter1]:
                        ttest_val = scipy.stats.ttest_ind(a,
                                                          b,
                                                          axis=0,
                                                          equal_var=False)
                        df_new['pValue: ' + str(iter) + ' vs ' +
                               str(iter1)][iter_feature] = (ttest_val.pvalue)
                    else:
                        df_new['pValue: ' + str(iter) + ' vs ' +
                               str(iter1)][iter_feature] = 0

        TablePrinter.printDataFrame(df_new)

        result_objects = []
        # Adding the newly created csv
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name='sigtest',
                                 set_keyword_list=True)

        result_objects.append(result_object)
        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col in range(0, len(kl1)):
            arr = df_new[kl1[col]]
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)
            command_name = 'sigtest'
            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)
        return result_objects
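At its core, each cell of the p-value table above is one Welch's t-test between two groups of a feature; a minimal scipy sketch with synthetic groups:

import numpy as np
import scipy.stats

rng = np.random.RandomState(0)
a = rng.normal(loc=0.0, size=50)  # feature values for one label
b = rng.normal(loc=0.5, size=50)  # feature values for another label

# Welch's t-test (unequal variances), as used above
ttest_val = scipy.stats.ttest_ind(a, b, axis=0, equal_var=False)
print("p-value:", ttest_val.pvalue)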
Example 23
    def evaluate(self, array_datas, data_frame):
        """
        Calculate ROC of the array and store it to history
        Parameters:

        """
        if data_frame is not None:
            df = data_frame.data
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            Printer.Print("Could not find the reference variable.")
            Printer.Print("Please set the reference variable")
            return ResultObject(None, None, None, CommandStatus.Error)
        else:
            gtVals = StatContainer.filterGroundTruth()
            ground_truth = StatContainer.ground_truth.name
            if len(gtVals) != df.shape[0]:
                Printer.Print(
                    "The size of the ground truth does not match with arrays being analyzed"
                )
                Printer.Print(len(gtVals), df.shape[0])
                return ResultObject(None, None, None, CommandStatus.Error)

        uniqVals = StatContainer.isCategorical(gtVals)
        df[ground_truth] = gtVals
        df_new = pd.DataFrame()
        if ground_truth in df.columns:
            df_new['features'] = df.columns.drop(ground_truth).values
        else:
            df_new['features'] = df.columns

        allCols = df_new['features']
        # Initialize the AUC column; it is filled with per-feature averages below
        df_new['AUC'] = 0

        avgAUC = []
        for iter_feature in range(len(df_new['features'])):
            arr = df[allCols[iter_feature]]
            model = LogisticRegression()
            X = arr.values
            X1 = X.reshape(-1, 1)
            model.fit(X1, gtVals)
            # evaluate the model
            allAUC = []
            Y_Pr = model.predict_proba(X1)
            for iter in range(len(uniqVals)):
                fpr, tpr, thresholds = metrics.roc_curve(
                    gtVals, Y_Pr[:, iter], pos_label=uniqVals[iter])
                auc_val = metrics.auc(fpr, tpr)
                allAUC.append(auc_val)
            avgAUC.append(np.mean(allAUC))
        df_new['AUC'] = avgAUC

        TablePrinter.printDataFrame(df_new)

        # New data frame
        result_objects = []
        result_object = ResultObject(df_new, [], DataType.csv,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name='rcurve',
                                 set_keyword_list=True)

        result_objects.append(result_object)

        # create an updated list of column names by removing the common names
        kl1 = df_new.columns
        truncated_kl1, common_name = StatContainer.removeCommonNames(kl1)
        for col in range(0, len(kl1)):
            arr = df_new[kl1[col]]
            result_object = ResultObject(arr, [], DataType.array,
                                         CommandStatus.Success)
            command_name = 'rcurve'
            result_object.createName(truncated_kl1[col],
                                     command_name=command_name,
                                     set_keyword_list=True)

            result_objects.append(result_object)

        return result_objects
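The per-feature AUC above boils down to fitting a one-feature LogisticRegression and scoring its predicted probabilities with roc_curve and auc; a self-contained binary-label sketch (synthetic data, positive class assumed to be 1):

import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
x = rng.randn(200)                              # one feature column
y = (x + 0.5 * rng.randn(200) > 0).astype(int)  # binary ground truth

model = LogisticRegression()
model.fit(x.reshape(-1, 1), y)
y_pr = model.predict_proba(x.reshape(-1, 1))

# ROC curve and AUC for the positive class
fpr, tpr, thresholds = metrics.roc_curve(y, y_pr[:, 1], pos_label=1)
print("AUC: %.3f" % metrics.auc(fpr, tpr))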
Example 24
    def evaluate(self, data_frame, array_datas):
        """
        Run PCA on a dataset of multiple arrays

        """

        # Get the data frame
        if data_frame is not None:
            df = data_frame.data
            df = DataGuru.convertStrCols_toNumeric(df)
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)
        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()
            # Remove nans:
            df, Y = DataGuru.removenan(df, Y)
        else:
            df.dropna(inplace=True)

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier
        pca = PCA(n_components=2)
        pca_res = pca.fit_transform(X)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        if Y is None:
            sc = ax.scatter(pca_res[:, 0],
                            pca_res[:, 1],
                            cmap="jet",
                            edgecolor="None",
                            alpha=0.35)
        else:
            sc = ax.scatter(pca_res[:, 0],
                            pca_res[:, 1],
                            c=Y,
                            cmap="jet",
                            edgecolor="None",
                            alpha=0.35)
            cbar = plt.colorbar(sc)
            cbar.ax.get_yaxis().labelpad = 15
            cbar.ax.set_ylabel(StatContainer.ground_truth.name, rotation=270)

        ax.set_title(cname)
        win.show()
        # return ResultObject(None, None, None, CommandStatus.Success)

        if data_frame is not None:
            return VizContainer.createResult(win, data_frame, ['pca'])
        else:
            return VizContainer.createResult(win, array_datas, ['pca'])
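A stripped-down standalone version of the PCA scatter above (synthetic features and labels replace df.values and the ground truth, and a plain matplotlib figure replaces the Window wrapper):

import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 5)          # stand-in for df.values
Y = (X[:, 0] > 0).astype(int)  # stand-in for the ground truth

X = preprocessing.StandardScaler().fit_transform(X)
pca_res = PCA(n_components=2).fit_transform(X)

fig, ax = plt.subplots()
sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], c=Y, cmap='jet',
                edgecolor='None', alpha=0.35)
fig.colorbar(sc)
ax.set_title('PCA projection')
plt.show()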
Example 25
    def evaluate(self, array_datas):
        """
        Calculate a label-wise mean array and store it to history
        Parameters:

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        if isinstance(array_datas, list) and len(array_datas) == 0:
            return result_object
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            gtVals = np.ones(df.shape[0])
            gtName = 'ground_truth'
        else:
            gtVals = StatContainer.filterGroundTruth()
            gtName = StatContainer.ground_truth.name

        # Remove nans:
        df[gtName] = gtVals
        df.dropna(inplace=True)

        gtVals = df[gtName]
        uniqVals = StatContainer.isCategorical(gtVals, uniqueCutoff=1000)
        binned_ground_truth = False

        if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
            # Convert to categorical
            df[gtName] = pd.cut(gtVals, 10)
            binned_ground_truth = True

        # Create groupwise arrays
        result_objects = []

        if uniqVals is not None:
            df_new = self.performOperation(df, gtName)

            df_new = df_new.reset_index()
            for col in df_new.columns:
                arr = df_new[col]
                kName = []
                if col == '':
                    kName = array_datas[0].keyword_list
                else:
                    # kName.append(cname)
                    kName.append(col)

                result_object = ResultObject(arr, [], DataType.array,
                                             CommandStatus.Success)
                command_name = 'labelwise.' + self._condition[0]
                result_object.createName(kName,
                                         command_name=command_name,
                                         set_keyword_list=True)

                result_objects.append(result_object)
            TablePrinter.printDataFrame(df_new)
        else:
            Printer.Print("The array is not of numeric type so cannot",
                          "calculate groupwise " + self._condition[0])
            result_objects.append(result_object)

        return result_objects
Example 26
    def evaluate(self, data_frame, classifier_algos):
        """
        Train and compare multiple classifiers on a dataset

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        sns.set(color_codes=True)
        df = data_frame.data
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)
        # if command_status == CommandStatus.Error:
        #    return ResultObject(None, None, None, CommandStatus.Error)

        # Get the ground truth array
        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.ground_truth.data

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        Printer.Print("Training classifier using the following features:")
        Printer.Print(df.columns)

        # Get all the classifier models to test against each other
        modelList = []
        Printer.Print("Testing the following classifiers: ")
        for classifier_algo in classifier_algos:
            model = (classifier_algo.data[0])
            Printer.Print(classifier_algo.name)
            modelList.append({'Name': classifier_algo.name, 'Model': model})

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        Printer.Print('Finding the best classifier using',
                      'k fold cross validation...')

        all_cv_scores, all_mean_cv_scores, all_confusion_matrices = DataGuru.FindBestClassifier(X, Y, modelList, 10)

        Printer.Print('\n\nPlotting the confusion matrices...\n')
        for iter in range(len(modelList)):
            win = Window.window()
            f = win.gcf()
            ax = f.add_subplot(111)
            DataGuru.plot_confusion_matrix(all_confusion_matrices[iter], np.unique(Y), ax, title=modelList[iter]['Name'])
            win.show()

        Printer.Print("\n\nBest classifier is " + modelList[np.argmax(all_mean_cv_scores)]['Name'] + " with an accuracy of -  %.2f%% " % max(all_mean_cv_scores))
        # TODO Need to save the model
        # Ask user for a name for the model
        result_object = ResultObject(None, None, None, CommandStatus.Success)

        return result_object
Example 27
    def evaluate(self, data_frame, array_datas, target):
        """
        Run clustering on a dataset of multiple arrays

        """

        # Get the data frame
        if data_frame is not None:
            df = data_frame.data

            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)

            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)

        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)
        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

            # Remove nans:
            df, Y = DataGuru.removenan(df, Y)
        else:
            df.dropna(inplace=True)

        # Get the tsne model

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier
        numbers = findNumbers(target.data, 1)
        if numbers != [] and numbers[0].data > 0:
            num_clusters = int(numbers[0].data)
        else:
            num_clusters = 2  # If not specified use 2 clusters

        kY = self.performOperation(X, num_clusters)
        result_objects = []
        if StatContainer.ground_truth is not None:
            df_res = pd.DataFrame()
            df_res['ground_truth'] = Y
            df_res['clustering_result'] = kY
            df_res = df_res.pivot_table(index=df_res.columns[0],
                                        columns=df_res.columns[1],
                                        aggfunc=np.size,
                                        fill_value=0)
            win = Window.window()
            f = win.gcf()
            ax = f.add_subplot(111)

            df_res = DataGuru.convertStrCols_toNumeric(df_res)

            sns.heatmap(df_res, ax=ax)
            win.show()
            if data_frame is not None:
                result_object = VizContainer.createResult(
                    win, data_frame, ['clstr.fig'])
            else:
                result_object = VizContainer.createResult(
                    win, array_datas, ['clstr.fig'])
            result_objects.append(result_object)

        result_object = ResultObject(kY, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name="clstr",
                                 set_keyword_list=True)

        result_objects.append(result_object)
        return result_objects
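performOperation is project code; assuming it wraps a k-means-style clusterer, a self-contained sketch of the cluster-versus-ground-truth contingency table built above (KMeans and pd.crosstab are stand-ins here) could be:

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = rng.randn(200, 4)          # stand-in for df.values
Y = (X[:, 0] > 0).astype(int)  # stand-in for the ground truth

X = preprocessing.StandardScaler().fit_transform(X)
num_clusters = 2
kY = KMeans(n_clusters=num_clusters).fit_predict(X)

# Contingency table of ground truth vs. cluster assignment
print(pd.crosstab(pd.Series(Y, name='ground_truth'),
                  pd.Series(kY, name='clustering_result')))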