Example #1
    def evaluate(self, array_datas):
        """
        Create a scatter plot between multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas)
        if command_status == CommandStatus.Error:
            return result_object
        if len(df.columns) <= 1:
            Printer.Print("There need to be at least two variables to perform a multiscatter plot!")
            return result_object

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', ax=ax)
        else:
            gt1 = pd.Series(StatContainer.filterGroundTruth())
            df, gt1 = DataGuru.removenan(df, gt1)
            lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size)))
            row_colors = gt1.map(lut)
            pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', c=row_colors, cmap="jet", ax=ax)

        f.suptitle(cname)

        win.show()

        return VizContainer.createResult(win, array_datas, ['multiscatter'])
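Outside the framework, the same colored scatter-matrix idiom can be reproduced with plain pandas and matplotlib; the sketch below uses made-up data, and the lut/row_colors names simply mirror the snippet above.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical stand-in data; the original builds df from array_datas.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["a", "b", "c"])
labels = pd.Series(rng.integers(0, 2, size=100))

# Map each class label to a value in [0, 1] and use it to color the points,
# mirroring the lut/row_colors trick in the snippet above.
lut = dict(zip(labels.unique(), np.linspace(0, 1, labels.unique().size)))
row_colors = labels.map(lut)

pd.plotting.scatter_matrix(df, alpha=0.2, diagonal="kde",
                           c=row_colors, cmap="jet")
plt.suptitle("multiscatter sketch")
plt.show()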
Example #2
    def evaluate(self, array_datas):
        """
        Create a box plot between multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return result_object
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        if StatContainer.ground_truth is None or len(
                StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            df.boxplot(ax=ax)
        else:
            ground_truth = StatContainer.ground_truth.name
            df[ground_truth] = StatContainer.filterGroundTruth()
            df.dropna(inplace=True)
            df.boxplot(by=ground_truth, ax=ax)
            f.suptitle("")
        win.show()

        return VizContainer.createResult(win, array_datas, ['box'])
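A minimal standalone version of the grouped box plot, assuming synthetic data in place of what DataGuru and StatContainer provide:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical data; the framework would supply df and the ground-truth column.
rng = np.random.default_rng(0)
df = pd.DataFrame({"height": rng.normal(170, 10, 90),
                   "weight": rng.normal(70, 8, 90),
                   "group": np.repeat(["a", "b", "c"], 30)})

# One box per numeric column, split by the grouping column, as in the else branch above.
df.boxplot(by="group")
plt.suptitle("")  # pandas adds a suptitle of its own; blank it like the snippet does
plt.show()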
Example #3
    def evaluate(self, data_frame, array_datas, classifier_algo, pre_evaluate_results=None):
        """
        Train a classifier on multiple arrays

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        if type(pre_evaluate_results) is not list:
            Printer.Print("Pre evaluation results failed! Attach bug report!")
            return result_object
        win = Window.window()

        if data_frame is not None:
            result_object = VizContainer.createResult(win, data_frame, ['cval'])
        elif array_datas is not None:
            result_object = VizContainer.createResult(win, array_datas, ['cval'])
        else:
            Printer.Print("Please provide either a data frame or array data")
            return result_object
        cv_output, aux_output = pre_evaluate_results
        properties, model_data = aux_output.data

        result_object.data = [win, properties, model_data, self.processkFoldCV]
        self.printkValueMessage(cv_output.data[0])
        self.updateWindow(win, cv_output.data[1], cv_output.data[2], model_data[1], properties["title"])
        self.modify_figure.evaluate(result_object)
        return result_object
Example #4
    def evaluate(self, array_datas):
        """
        Create a line plot 

        """
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True, expand_single=True,
                remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        elif (df.shape[0] == 0 or
              (df.shape[1] == 1 and
               not np.issubdtype(array_datas[0].data.dtype, np.number))):
            Printer.Print("No data left to plot after cleaning up!")
            return ResultObject(None, None, None, CommandStatus.Error)

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        ax.set_title(cname)
        df.plot(ax=ax)

        win.show()

        return VizContainer.createResult(win, array_datas, ['line'])
Example #5
    def evaluate(self, array_datas):
        """
        Visualize the relationship between variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        df = pd.DataFrame()
        for array_data in array_datas:
            if np.issubdtype(array_data.data.dtype, np.number):
                Printer.Print("The data to plot is not categorical; please use a scatter plot")
                return result_object
            df[" ".join(array_data.keyword_list)] = array_data.data

        df.dropna(inplace=True)
        df = df.pivot_table(
            index=df.columns[0], columns=df.columns[1], aggfunc=np.size, fill_value=0)

        Printer.Print("Displaying heatmap")
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        sns.heatmap(df, ax=ax)

        win.show()
        return VizContainer.createResult(win, array_datas, ['heatmap'])
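The pivot into a co-occurrence table can be exercised on its own; pd.crosstab below is an equivalent way to build the same count matrix from two hypothetical categorical columns.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Two hypothetical categorical columns standing in for array_datas.
rng = np.random.default_rng(0)
df = pd.DataFrame({"color": rng.choice(["red", "green", "blue"], 200),
                   "shape": rng.choice(["circle", "square"], 200)})

# Count how often each (color, shape) pair occurs, then draw the table as a heatmap.
counts = pd.crosstab(df["color"], df["shape"])

fig, ax = plt.subplots()
sns.heatmap(counts, annot=True, fmt="d", ax=ax)
plt.show()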
Example #6
    def evaluate(self, image):
        """
        Display the image specified
        """
        try:
            if image.data_type is DataType.file_name:
                file_path = image.data.path
                if not os.path.isfile(file_path):
                    Printer.Print("Cannot find image file: ", file_path)
                    raise RuntimeError
                curr_image = imread(file_path)
                result_object = ResultObject(
                    curr_image, image.keyword_list, DataType.image, CommandStatus.Success)
            else:
                curr_image = image.data
                result_object = ResultObject(
                    None, None, None, CommandStatus.Success)
            image_name = image.keyword_list[0]
            win = Window.window()
            plt.imshow(curr_image)
            plt.gca().axis('off')
            win.show()
            Printer.Print("Displaying image " + image_name)
        except Exception:
            result_object = ResultObject(None, None, None, CommandStatus.Error)

        return result_object
Example #7
    def evaluate(self, array_datas):
        """
        Display a heatmap for data visualization

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        Printer.Print("Displaying heatmap")
        win = Window.window()
        f = win.gcf()
        if StatContainer.ground_truth is None:
            sns.clustermap(df,
                           cbar=True,
                           square=False,
                           annot=False,
                           cmap='jet',
                           standard_scale=1)
        else:
            gt1 = pd.Series(StatContainer.ground_truth.data)
            lut = dict(zip(gt1.unique(), "rbg"))
            row_colors = gt1.map(lut)
            sns.clustermap(df,
                           standard_scale=1,
                           row_colors=row_colors,
                           cmap="jet")

        win.show()
        return VizContainer.createResult(win, array_datas, ['heatmap'])
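A self-contained clustermap sketch with synthetic data; the row_colors annotation is the same device used above to show the ground-truth class next to each clustered row.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothetical numeric matrix plus a per-row class label.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(30, 4)), columns=list("abcd"))
gt1 = pd.Series(rng.choice(["x", "y", "z"], 30))

# One color per class, drawn as a strip beside the clustered rows.
lut = dict(zip(gt1.unique(), "rbg"))
row_colors = gt1.map(lut)

sns.clustermap(df, standard_scale=1, row_colors=row_colors, cmap="jet")
plt.show()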
Example #8
    def evaluate(self, array_datas):
        """
        Create a violin plot for multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            sns.violinplot(data=df, ax=ax)
        else:
            ground_truth = " ".join(StatContainer.ground_truth.keyword_list)
            df[ground_truth] = StatContainer.filterGroundTruth()
            df.dropna(inplace=True)
            df1 = pd.melt(df, id_vars=ground_truth)
            sns.violinplot(data=df1, ax=ax, x='variable', y='value', hue=ground_truth)

        win.show()

        return VizContainer.createResult(win, array_datas, ['violin'])
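The melt-then-hue idiom from the else branch can be tried in isolation; a sketch with made-up data, where 'species' plays the role of the ground-truth column:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothetical wide-form data plus a grouping column.
rng = np.random.default_rng(0)
df = pd.DataFrame({"sepal length": rng.normal(5.8, 0.8, 90),
                   "petal length": rng.normal(3.7, 1.7, 90),
                   "species": np.repeat(["a", "b", "c"], 30)})

# Melt to long form so every numeric column becomes a level of 'variable',
# then split each violin by the grouping column.
df1 = pd.melt(df, id_vars="species")

fig, ax = plt.subplots()
sns.violinplot(data=df1, x="variable", y="value", hue="species", ax=ax)
plt.show()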
Example #9
    def evaluate(self, data_frame, classifier_model):
        """
        Run a trained classifier on multiple arrays

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        sns.set(color_codes=True)
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)
        df = data_frame.data
        # if command_status == CommandStatus.Error:
        #    return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector as ground truth by",
                          "typing 'set ground truth to ...'",
                          "to get the prediction accuracy")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.ground_truth.data

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)
        X = df.values

        # Get the classifier model
        trained_model = classifier_model.data
        model = trained_model['Model']
        scaler = trained_model['Scaler']

        # Scale the values based on the training standardizer
        X = scaler.transform(X)

        # Code to run the classifier
        # Plot the classification result
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        Printer.Print('Running the trained classifier...')

        predictions = model.predict(X)
        accuracy = metrics.accuracy_score(predictions, Y)
        Printer.Print("Accuracy : %s" % "{0:.3%}".format(accuracy))
        cm = metrics.confusion_matrix(Y, predictions)
        DataGuru.plot_confusion_matrix(cm,
                                       np.unique(Y),
                                       ax,
                                       title="confusion matrix")
        win.show()

        # TODO Need to save the result
        result_object = ResultObject(None, None, None, CommandStatus.Success)

        return result_object
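The scale-predict-score-confusion sequence is standard scikit-learn; a runnable sketch with a hypothetical model/scaler pair standing in for the {'Model': ..., 'Scaler': ...} dict above:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, preprocessing
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hypothetical trained model and scaler.
X, Y = load_iris(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
scaler = preprocessing.StandardScaler().fit(X_train)
model = LogisticRegression(max_iter=1000).fit(scaler.transform(X_train), Y_train)

# Scale with the training standardizer, predict, report accuracy, plot confusion matrix.
predictions = model.predict(scaler.transform(X_test))
accuracy = metrics.accuracy_score(Y_test, predictions)
print("Accuracy : {0:.3%}".format(accuracy))
cm = metrics.confusion_matrix(Y_test, predictions)

fig, ax = plt.subplots()
metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=np.unique(Y_test)).plot(ax=ax)
plt.show()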
Example #10
    def evaluate(self, data_frame, array_datas):
        """
        Run Isomap on a dataset of multiple arrays

        """

        # Get the data frame
        if data_frame is not None:
            df = data_frame.data
            df = DataGuru.convertStrCols_toNumeric(df)
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)
        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()
            df, Y = DataGuru.removenan(df, Y)
        else:
            # Remove nans:
            df.dropna(inplace=True)

        # Get the Isomap model

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier

        win = Window.window()

        properties = self.createDefaultProperties()
        properties['title'] = cname

        # return ResultObject(None, None, None, CommandStatus.Success)
        if data_frame is not None:
            result_object = VizContainer.createResult(win, data_frame,
                                                      ['ismp'])
        else:
            result_object = VizContainer.createResult(win, array_datas,
                                                      ['ismp'])

        result_object.data = [win, properties, [X, Y], self.updateFigure]
        self.updateFigure(result_object.data)
        self.modify_figure.evaluate(result_object)
        return result_object
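The snippet defers the actual embedding to updateFigure; assuming it boils down to scikit-learn's Isomap, a standalone sketch looks like this (the dataset is a stand-in):

import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.datasets import load_digits
from sklearn.manifold import Isomap

# Hypothetical dataset; the framework hands over X (features) and Y (ground truth).
X, Y = load_digits(return_X_y=True)
X, Y = X[:500], Y[:500]

# Standardize, then embed into two dimensions.
X = preprocessing.StandardScaler().fit_transform(X)
embedding = Isomap(n_components=2).fit_transform(X)

fig, ax = plt.subplots()
sc = ax.scatter(embedding[:, 0], embedding[:, 1], c=Y, cmap="jet", alpha=0.35)
fig.colorbar(sc, ax=ax)
ax.set_title("Isomap sketch")
plt.show()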
Example #11
    def evaluate(self, array_data):
        """
        Create a pie plot 

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        stTitle = " ".join(array_data.keyword_list)
        if StatContainer.conditional_array is not None and len(
                StatContainer.conditional_array.data) == array_data.data.size:
            inds = StatContainer.conditional_array.data
            Printer.Print("Nfiltered: ", np.sum(inds))
        else:
            inds = np.full(array_data.data.size, True)
        col_data = pd.Series(array_data.data[inds], name='array')
        col_data.dropna(inplace=True)
        try:
            uniqVals, inv, counts = np.unique(col_data,
                                              return_inverse=True,
                                              return_counts=True)
        except Exception:
            return ResultObject(None, None, None, CommandStatus.Error)
        if len(uniqVals) > self.max_unique:
            if isinstance(uniqVals[0], str):
                best_idx = np.argpartition(counts,
                                           -self.max_unique)[-self.max_unique:]
                idx = np.isin(inv, best_idx)
                col_data = col_data[idx]
            elif np.issubdtype(col_data.dtype, np.number):
                # Convert to categorical
                col_data = pd.cut(col_data, 10)
                uniqVals = True
            else:
                uniqVals = None

        if uniqVals is not None:
            counts = pd.Series(np.ones(col_data.size), name='count')
            concat_df = pd.concat([counts, col_data], axis=1)
            ds = concat_df.groupby(col_data.name).sum()['count']
        else:
            Printer.Print("Too many unique values to plot on a pie chart\n")
            Printer.Print("Please select another chart type")
            return result_object

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        ds.plot.pie(figsize=(8, 8), ax=ax)
        ax.set_title(stTitle)
        ax.set_xlabel('')
        ax.set_aspect('equal')

        win.show()
        return VizContainer.createResult(win, array_data, ['pie'])
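The bin-then-count fallback for numeric data can be reproduced directly; value_counts below is equivalent to the groupby-sum of ones used above.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical numeric column; a categorical column could be used as-is.
rng = np.random.default_rng(0)
col_data = pd.Series(rng.normal(50, 15, 500), name="array")

# Too many unique values for a pie chart, so bin into 10 intervals
# and count how many rows fall into each bin.
binned = pd.cut(col_data, 10)
ds = binned.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 8))
ds.plot.pie(ax=ax)
ax.set_title("pie sketch")
ax.set_ylabel("")
ax.set_aspect("equal")
plt.show()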
Example #12
    def evaluate(self, array_datas):
        """
        Create a histogram for multiple variables

        """

        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas, useCategorical=True, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
        dCol = df[df.columns[0]]
        try:
            uniqVals, inv, counts = np.unique(dCol,
                                              return_inverse=True,
                                              return_counts=True)
        except Exception:
            return ResultObject(None, None, None, CommandStatus.Error)
        if len(uniqVals) > self.max_unique:
            if isinstance(uniqVals[0], str):
                best_idx = np.argpartition(counts,
                                           -self.max_unique)[-self.max_unique:]
                idx = np.isin(inv, best_idx)
                dCol = dCol[idx]
            else:
                uniqVals = None
        if uniqVals is not None and isinstance(uniqVals[0], str):
            max_len = max([len(uniqVal) for uniqVal in uniqVals])
        else:
            max_len = 0

        if (uniqVals is None and not np.issubdtype(dCol.dtype, np.number)):
            Printer.Print("Too many unique values in non-numeric type data")
            return ResultObject(None, None, None, CommandStatus.Error)

        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        # TODO Create an argument for setting number of bins
        if uniqVals is not None:
            if len(uniqVals) > 5 and max_len > 8:
                df = dCol.to_frame(name=kl1[0])
                sns.countplot(y=kl1[0], data=df, ax=ax)
            else:
                df = dCol.to_frame(name=kl1[0])
                sns.countplot(x=kl1[0], data=df, ax=ax)
        elif np.issubdtype(dCol.dtype, np.number):
            df.plot.hist(stacked=True, ax=ax)

        win.show()

        return VizContainer.createResult(win, array_datas,
                                         ['histogram', 'hist'])
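The cardinality-driven choice between a count plot and a histogram, shown side by side on synthetic data (the column names are made up):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Categorical column with few unique values -> count plot.
cats = pd.DataFrame({"fruit": rng.choice(["apple", "pear", "plum"], 300)})
sns.countplot(x="fruit", data=cats, ax=ax1)

# Numeric column -> stacked histogram, as in the np.number branch above.
nums = pd.DataFrame({"value": rng.normal(size=300)})
nums.plot.hist(stacked=True, ax=ax2)

plt.show()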
Example #13
    def read(self, file_path, keyword_list):
        try:
            data = imread(file_path)
        except Exception:
            return ResultObject(None, None, None,
                                command_status=CommandStatus.Error)

        win = Window.window()
        #f = win.gcf()
        plt.imshow(data)
        plt.gca().axis('off')
        win.show()
        # Initialize image manipulation command group
        result = ResultObject(data, keyword_list, DataType.image,
                              CommandStatus.Success, add_to_cache=True)
        result.createName(keyword_list)
        return result
Example #14
    def evaluate(self, array_datas):
        """
        Create a scatter plot between two variables

        """
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            Printer.Print("Please try the following command:",
                          "Visualize comparison between...")
            return ResultObject(None, None, None, CommandStatus.Error)

        properties = self.createDefaultProperties()
        properties['title'] = cname

        win = Window.window()
        row_colors = None
        if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]:
            df.dropna(inplace=True)
            if df.shape[0] == 0:
                return ResultObject(None, None, None, CommandStatus.Error)
            array = df.values
        else:
            gt1 = pd.Series(StatContainer.filterGroundTruth())
            df, gt1 = DataGuru.removenan(df, gt1)
            if df.shape[0] == 0:
                return ResultObject(None, None, None, CommandStatus.Error)
            lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size)))
            row_colors = gt1.map(lut)
            array = df.values

        result_object = VizContainer.createResult(
            win, array_datas, ['scatter2d'])
        result_object.data = [win, properties, [
            array, row_colors, kl1], self.updateFigure]
        self.updateFigure(result_object.data)
        self.modify_figure.evaluate(result_object)
        return result_object
Example #15
    def evaluate(self, array_datas):
        """
        Find the correlation between two or more variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        if len(array_datas) < 2:
            Printer.Print("Need at least two arrays to compute correlation")
            return ResultObject(None, None, None, CommandStatus.Error)

        sns.set(color_codes=True)
        command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(
            array_datas, remove_nan=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        corr_res = df.corr()

        if len(array_datas) == 2:
            Printer.Print("The correlation between ", kl1[0], " and ", kl1[1],
                          " is ", str(corr_res.values[0][1]))

        Printer.Print("Displaying the result as a heatmap")
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        sns.heatmap(corr_res,
                    cbar=True,
                    square=True,
                    annot=True,
                    fmt='.2f',
                    annot_kws={'size': 15},
                    xticklabels=df.columns,
                    yticklabels=df.columns,
                    cmap='jet',
                    ax=ax)
        win.show()
        return VizContainer.createResult(win, array_datas, ['correlation'])
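A standalone version of the correlation heatmap, using made-up columns with a known dependence so the off-diagonal entry is visibly non-zero:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothetical numeric columns standing in for the input arrays.
rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=100)})
df["b"] = 0.5 * df["a"] + rng.normal(size=100)
df["c"] = rng.normal(size=100)

corr_res = df.corr()
print("The correlation between a and b is", corr_res.values[0][1])

fig, ax = plt.subplots()
sns.heatmap(corr_res, cbar=True, square=True, annot=True, fmt=".2f",
            annot_kws={"size": 15}, cmap="jet", ax=ax)
plt.show()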
Example #16
    def evaluate(self, array_datas):
        """
        Create a bar plot between multiple variables

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)
        sns.set(color_codes=True)
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)

        if StatContainer.ground_truth is None:
            gtVals = np.ones(df.shape[0])
            ground_truth = 'ground_truth'
        else:
            gtVals = StatContainer.filterGroundTruth()
            ground_truth = StatContainer.ground_truth.name
            if len(gtVals) != df.shape[0]:
                Printer.Print("Ground truth does not match the data frame shape: ",
                              len(gtVals), " vs ", df.shape[0])
                gtVals = np.ones(df.shape[0])
                ground_truth = 'ground_truth'

        # Remove nans:
        df[ground_truth] = gtVals
        df.dropna(inplace=True)
        gtVals = df[ground_truth]
        uniqVals = StatContainer.isCategorical(gtVals)
        binned_ground_truth = False
        if uniqVals is None and np.issubdtype(gtVals.dtype, np.number):
            # Convert to categorical
            df[ground_truth] = pd.cut(gtVals, 10)
            binned_ground_truth = True

        if binned_ground_truth is True or uniqVals is not None:
            gb = df.groupby(ground_truth)
            df_mean = gb.mean()
            df_errors = gb.std()
            if uniqVals is not None and isinstance(uniqVals[0], str):
                truncated_uniqVals, _ = StatContainer.removeCommonNames(
                    df_mean.index)
                df_mean.index = truncated_uniqVals
                df_errors.index = truncated_uniqVals
            # Number of uniq_vals x number of arrs
            df_mean_shape = df_mean.shape
            if (not binned_ground_truth
                    and df_mean_shape[1] >= df_mean_shape[0]):
                df_mean = df_mean.T
                df_errors = df_errors.T
        else:
            Printer.Print("Ground truth could not be mapped to",
                          "categorical array\n")
            Printer.Print("Please clear or select appropriate ground truth")
            return result_object

        properties = self.createDefaultProperties()
        properties['title'] = cname
        if uniqVals is not None and isinstance(uniqVals[0], str):
            max_len = max([len(uniqVal) for uniqVal in uniqVals])
        else:
            max_len = 0
        if (binned_ground_truth or
            (uniqVals is not None and len(uniqVals) > 5 and max_len > 8)):
            properties["horizontal"] = True
        if binned_ground_truth:
            properties["overwrite_labels"] = True
            properties["ylabel"] = StatContainer.ground_truth.name
        win = Window.window()
        result_object = VizContainer.createResult(win, array_datas, ['bar'])
        result_object.data = [
            win, properties, [df_mean, df_errors], self.updateFigure
        ]
        self.updateFigure(result_object.data)
        self.modify_figure.evaluate(result_object)
        return result_object
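The core of the bar plot, per-group means with standard deviations as error bars, independent of the framework's updateFigure plumbing; data and column names below are hypothetical.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical numeric columns plus a categorical ground-truth column.
rng = np.random.default_rng(0)
df = pd.DataFrame({"height": rng.normal(170, 10, 90),
                   "weight": rng.normal(70, 8, 90),
                   "group": np.repeat(["a", "b", "c"], 30)})

# Mean and standard deviation per group, the [df_mean, df_errors] pair above.
gb = df.groupby("group")
df_mean = gb.mean()
df_errors = gb.std()

fig, ax = plt.subplots()
df_mean.plot.bar(yerr=df_errors, ax=ax)
plt.show()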
Example #17
    def evaluate(self, data_frame, classifier_algos):
        """
        Train a classifier on multiple arrays

        """
        result_object = ResultObject(None, None, None, CommandStatus.Error)

        # Get the data frame
        sns.set(color_codes=True)
        df = data_frame.data
        #command_status, df, kl1, _ = DataGuru.transformArray_to_dataFrame(array_datas)
        # if command_status == CommandStatus.Error:
        #    return ResultObject(None, None, None, CommandStatus.Error)

        # Get the ground truth array
        if StatContainer.ground_truth is None:
            Printer.Print("Please set a feature vector to ground truth by",
                          "typing set ground truth before using this command")
            result_object = ResultObject(None, None, None, CommandStatus.Error)
            return result_object
        else:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.ground_truth.data

        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)

        Printer.Print("Training classifier using the following features:")
        Printer.Print(df.columns)

        # Get all the classifier models to test against each other
        modelList = []
        Printer.Print("Testing the following classifiers: ")
        for classifier_algo in classifier_algos:
            model = (classifier_algo.data[0])
            Printer.Print(classifier_algo.name)
            modelList.append({'Name': classifier_algo.name, 'Model': model})

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        Printer.Print('Finding the best classifier using',
                      'k fold cross validation...')

        all_cv_scores, all_mean_cv_scores, all_confusion_matrices = DataGuru.FindBestClassifier(
            X, Y, modelList, 10)

        Printer.Print('\n\nPlotting the confusion matrices...\n')
        for i in range(len(modelList)):
            win = Window.window()
            f = win.gcf()
            ax = f.add_subplot(111)
            DataGuru.plot_confusion_matrix(all_confusion_matrices[i],
                                           np.unique(Y), ax,
                                           title=modelList[i]['Name'])
            win.show()

        Printer.Print("\n\nBest classifier is " +
                      modelList[np.argmax(all_mean_cv_scores)]['Name'] +
                      " with an accuracy of %.2f%%" % max(all_mean_cv_scores))
        # TODO Need to save the model
        # Ask user for a name for the model
        result_object = ResultObject(None, None, None, CommandStatus.Success)

        return result_object
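DataGuru.FindBestClassifier is framework code, but the comparison itself is ordinary k-fold cross validation; a sketch with three common scikit-learn models on a stand-in dataset:

import numpy as np
from sklearn import preprocessing
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Hypothetical feature matrix X and ground truth Y.
X, Y = load_iris(return_X_y=True)
X = preprocessing.StandardScaler().fit_transform(X)

# Candidate models, analogous to modelList above.
modelList = [{"Name": "logreg", "Model": LogisticRegression(max_iter=1000)},
             {"Name": "knn", "Model": KNeighborsClassifier()},
             {"Name": "svm", "Model": SVC()}]

# 10-fold cross validation per candidate, then keep the best mean score.
mean_scores = [cross_val_score(m["Model"], X, Y, cv=10).mean() for m in modelList]
best = modelList[int(np.argmax(mean_scores))]
print("Best classifier is %s with an accuracy of %.2f%%"
      % (best["Name"], 100 * max(mean_scores)))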
Example #18
    def evaluate(self, data_frame, array_datas):
        """
        Run pca on a dataset of multiple arrays

        """

        # Get the data frame
        if data_frame is not None:
            df = data_frame.data
            df = DataGuru.convertStrCols_toNumeric(df)
            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)
            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)
        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)
        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()
            # Remove nans:
            df, Y = DataGuru.removenan(df, Y)
        else:
            df.dropna(inplace=True)

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier
        pca = PCA(n_components=2)
        pca_res = pca.fit_transform(X)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)

        if Y is None:
            sc = ax.scatter(pca_res[:, 0],
                            pca_res[:, 1],
                            cmap="jet",
                            edgecolor="None",
                            alpha=0.35)
        else:
            sc = ax.scatter(pca_res[:, 0],
                            pca_res[:, 1],
                            c=Y,
                            cmap="jet",
                            edgecolor="None",
                            alpha=0.35)
            cbar = plt.colorbar(sc)
            cbar.ax.get_yaxis().labelpad = 15
            cbar.ax.set_ylabel(StatContainer.ground_truth.name, rotation=270)

        ax.set_title(cname)
        win.show()
        # return ResultObject(None, None, None, CommandStatus.Success)

        if data_frame is not None:
            return VizContainer.createResult(win, data_frame, ['pca'])
        else:
            return VizContainer.createResult(win, array_datas, ['pca'])
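A self-contained PCA scatter with the same styling, using a stock dataset as a stand-in for the assembled data frame:

import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Hypothetical dataset; Y plays the role of the ground truth.
X, Y = load_iris(return_X_y=True)

X = preprocessing.StandardScaler().fit_transform(X)
pca_res = PCA(n_components=2).fit_transform(X)

fig, ax = plt.subplots()
sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], c=Y, cmap="jet",
                edgecolor="None", alpha=0.35)
cbar = fig.colorbar(sc, ax=ax)
cbar.ax.set_ylabel("class", rotation=270, labelpad=15)
ax.set_title("pca sketch")
plt.show()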
Example #19
    def evaluate(self, data_frame, array_datas, target):
        """
        Run clustering on a dataset of multiple arrays

        """

        # Get the data frame
        if data_frame is not None:
            df = data_frame.data

            cname = data_frame.name
        elif array_datas is not None:
            command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
                array_datas, useCategorical=True)

            if command_status == CommandStatus.Error:
                return ResultObject(None, None, None, CommandStatus.Error)

        else:
            Printer.Print("Please provide data frame or arrays to analyze")
            return ResultObject(None, None, None, CommandStatus.Error)
        Y = None
        if StatContainer.ground_truth is not None:
            df = DataGuru.removeGT(df, StatContainer.ground_truth)
            Y = StatContainer.filterGroundTruth()

            # Remove nans:
            df, Y = DataGuru.removenan(df, Y)
        else:
            df.dropna(inplace=True)

        # Get the tsne model

        # Code to run the classifier
        X = df.values

        # Get a standard scaler for the extracted data X
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

        # Train the classifier
        numbers = findNumbers(target.data, 1)
        if numbers != [] and numbers[0].data > 0:
            num_clusters = int(numbers[0].data)
        else:
            num_clusters = 2  # If not specified use 2 clusters

        kY = self.performOperation(X, num_clusters)
        result_objects = []
        if StatContainer.ground_truth is not None:
            df_res = pd.DataFrame()
            df_res['ground_truth'] = Y
            df_res['clustering_result'] = kY
            df_res = df_res.pivot_table(index=df_res.columns[0],
                                        columns=df_res.columns[1],
                                        aggfunc=np.size,
                                        fill_value=0)
            win = Window.window()
            f = win.gcf()
            ax = f.add_subplot(111)

            df_res = DataGuru.convertStrCols_toNumeric(df_res)

            sns.heatmap(df_res, ax=ax)
            win.show()
            if data_frame is not None:
                result_object = VizContainer.createResult(
                    win, data_frame, ['clstr.fig'])
            else:
                result_object = VizContainer.createResult(
                    win, array_datas, ['clstr.fig'])
            result_objects.append(result_object)

        result_object = ResultObject(kY, [], DataType.array,
                                     CommandStatus.Success)
        result_object.createName(cname,
                                 command_name="clstr",
                                 set_keyword_list=True)

        result_objects.append(result_object)
        return result_objects
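performOperation is framework-specific; assuming a k-means style clusterer, the cluster-versus-ground-truth comparison reduces to a contingency table drawn as a heatmap. A sketch with KMeans as one plausible stand-in:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

# Hypothetical data with a known ground truth to compare the clusters against.
X, Y = load_iris(return_X_y=True)
X = preprocessing.StandardScaler().fit_transform(X)

num_clusters = 3  # the snippet parses this number from the user's command
kY = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit_predict(X)

# Contingency table of ground truth vs. cluster label, shown as a heatmap.
table = pd.crosstab(pd.Series(Y, name="ground_truth"),
                    pd.Series(kY, name="clustering_result"))

fig, ax = plt.subplots()
sns.heatmap(table, annot=True, fmt="d", ax=ax)
plt.show()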