def evaluate(self, array_datas): """ Create a scatter plot between multiple variables """ result_object = ResultObject(None, None, None, CommandStatus.Error) sns.set(color_codes=True) command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas) if command_status == CommandStatus.Error: return result_object if len(df.columns) <= 1: Printer.Print("There needs to be atleast two variables to perform multiscatter plot!") return result_object win = Window.window() f = win.gcf() ax = f.add_subplot(111) if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]: df.dropna(inplace=True) pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', ax=ax) else: gt1 = pd.Series(StatContainer.filterGroundTruth()) df, gt1 = DataGuru.removenan(df, gt1) lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size))) row_colors = gt1.map(lut) pd.plotting.scatter_matrix(df, alpha=0.2, diagonal='kde', c=row_colors, cmap="jet", ax=ax) f.suptitle(cname) win.show() return VizContainer.createResult(win, array_datas, ['multiscatter'])
def evaluate(self, data_frame, classifier_algo):
    """
    Train a classifier on multiple arrays
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    df = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.filterGroundTruth()

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)

    # Get the classifier model
    model = classifier_algo.data[0]

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Train the classifier
    Printer.Print("Training the classifier")
    df_show = pd.DataFrame()
    df_show['Features'] = df.columns
    TablePrinter.printDataFrame(df_show)
    model.fit(X, Y)

    # Print an update
    Printer.Print("The classifier", classifier_algo.name, "has been trained")
    predictions = model.predict(X)
    accuracy = metrics.accuracy_score(Y, predictions)
    Printer.Print("Accuracy on training set: %s" % "{0:.3%}".format(accuracy))

    # Keep the scaler together with the model so new data can be transformed
    # the same way at prediction time
    trained_model = {'Scaler': scaler, 'Model': model}
    result_object = ResultObject(trained_model, [], DataType.trained_model,
                                 CommandStatus.Success)
    classifier_algo_name = classifier_algo.name.replace('.', ' ')
    result_object.createName(data_frame.keyword_list,
                             command_name=classifier_algo_name,
                             set_keyword_list=True)
    return result_object
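# Hedged sketch (illustrative only): the core scikit-learn pattern used above,
# standardize the features, fit a classifier, report training accuracy, and bundle
# the scaler with the model. LogisticRegression stands in for whatever classifier
# the classifier_algo argument provides.
import numpy as np
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression


def train_demo():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 4))
    y = (X[:, 0] + X[:, 1] > 0).astype(int)

    scaler = preprocessing.StandardScaler().fit(X)
    Xs = scaler.transform(X)

    model = LogisticRegression()
    model.fit(Xs, y)

    acc = metrics.accuracy_score(y, model.predict(Xs))
    print("Training accuracy: {0:.3%}".format(acc))
    return {'Scaler': scaler, 'Model': model}  # same bundle shape as trained_model above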
def evaluate(self, data_frame, classifier_model):
    """
    Run a trained classifier on multiple arrays
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    sns.set(color_codes=True)
    df = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth to ...'",
                      "to get the prediction accuracy")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)
    X = df.values

    # Get the trained classifier model and its scaler
    trained_model = classifier_model.data
    model = trained_model['Model']
    scaler = trained_model['Scaler']

    # Scale the values with the standardizer fit during training
    X = scaler.transform(X)

    # Run the classifier and plot the classification result
    win = Window.window()
    f = win.gcf()
    ax = f.add_subplot(111)
    Printer.Print('Running the trained classifier...')
    predictions = model.predict(X)
    accuracy = metrics.accuracy_score(Y, predictions)
    Printer.Print("Accuracy: %s" % "{0:.3%}".format(accuracy))
    cm = metrics.confusion_matrix(Y, predictions)
    DataGuru.plot_confusion_matrix(cm, np.unique(Y), ax,
                                   title="confusion matrix")
    win.show()

    # TODO: Need to save the result
    return ResultObject(None, None, None, CommandStatus.Success)
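# Hedged sketch (illustrative only): applying a previously trained
# {'Scaler': ..., 'Model': ...} bundle to new data and drawing a confusion matrix
# with plain matplotlib, analogous to what DataGuru.plot_confusion_matrix does above.
# The function name and arguments are invented for the example.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics


def run_trained_demo(trained_model, X_new, y_true):
    Xs = trained_model['Scaler'].transform(X_new)
    predictions = trained_model['Model'].predict(Xs)
    print("Accuracy: {0:.3%}".format(metrics.accuracy_score(y_true, predictions)))

    cm = metrics.confusion_matrix(y_true, predictions)
    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap='Blues')
    ax.set_xticks(range(len(np.unique(y_true))))
    ax.set_yticks(range(len(np.unique(y_true))))
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion matrix')
    fig.colorbar(im)
    plt.show()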
def evaluate(self, data_frame, array_datas): """ Run Isomap on a dataset of multiple arrays """ # Get the data frame if data_frame is not None: df = data_frame.data df = DataGuru.convertStrCols_toNumeric(df) cname = data_frame.name elif array_datas is not None: command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas, useCategorical=True) if command_status == CommandStatus.Error: return ResultObject(None, None, None, CommandStatus.Error) else: Printer.Print("Please provide data frame or arrays to analyze") return ResultObject(None, None, None, CommandStatus.Error) Y = None if StatContainer.ground_truth is not None: df = DataGuru.removeGT(df, StatContainer.ground_truth) Y = StatContainer.filterGroundTruth() df, Y = DataGuru.removenan(df, Y) # Remove nans: else: df.dropna(inplace=True) # Get the Isomap model # Code to run the classifier X = df.values # Get a standard scaler for the extracted data X scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) # Train the classifier win = Window.window() properties = self.createDefaultProperties() properties['title'] = cname # return ResultObject(None, None, None, CommandStatus.Success) if data_frame is not None: result_object = VizContainer.createResult(win, data_frame, ['ismp']) else: result_object = VizContainer.createResult(win, array_datas, ['ismp']) result_object.data = [win, properties, [X, Y], self.updateFigure] self.updateFigure(result_object.data) self.modify_figure.evaluate(result_object) return result_object
def evaluate(self, data_frame, target):
    """
    Use one of the models to identify the top predictors
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    df = data_frame.data
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.filterGroundTruth()

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)

    # How many top features to report
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        num = int(numbers[0].data)
    else:
        num = 10  # If not specified, select the top 10 features

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # Rank the features by random-forest importance
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, Y)
    featImpVals = model.feature_importances_
    featimp = pd.Series(featImpVals,
                        index=df.columns).sort_values(ascending=False)

    df_show = pd.DataFrame()
    df_show['top features'] = featimp.index[0:num]
    df_show['feature importance'] = featimp.values[0:num]
    TablePrinter.printDataFrame(df_show)

    df_new = df[featimp.index[0:num]]
    result_object = ResultObject(df_new, [], DataType.csv,
                                 CommandStatus.Success)
    command_name = 'top.predictors'
    result_object.createName(data_frame.name, command_name=command_name,
                             set_keyword_list=True)
    return result_object
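# Hedged sketch (illustrative only): ranking features by random-forest importance,
# the same idea as the 'top.predictors' command above, stripped of the framework
# plumbing. The demo function name is invented.
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier


def top_predictors_demo(df, y, num=10):
    X = preprocessing.StandardScaler().fit_transform(df.values)
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y)
    # Importance per column, sorted from most to least informative
    featimp = pd.Series(model.feature_importances_,
                        index=df.columns).sort_values(ascending=False)
    top = featimp.iloc[:num]
    print(top)
    return df[top.index]  # data frame restricted to the top predictors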
def preEvaluate(self, data_frame, array_datas, classifier_algo):
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    sns.set(color_codes=True)
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas)
        if command_status == CommandStatus.Error:
            print("Error in getting dataframe!")
            result_object.data = "Error in getting dataframe!"
            return result_object
    else:
        result_object.data = "Please provide a data frame or arrays to analyze"
        return result_object

    # Get the ground truth array
    if StatContainer.ground_truth is None:
        result_object.data = ("Please set a feature vector to ground truth by "
                              "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)

    # Get the classifier model
    model = classifier_algo.data[0]

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    properties = self.createDefaultProperties()
    properties['title'] = cname

    # Run cross validation and return both the CV output and the auxiliary data
    cv_output = self.performCV(properties, X, Y, model)
    aux_output = (properties, [X, Y, model])
    return [ResultObject(cv_output, None), ResultObject(aux_output, None)]
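# Hedged sketch (illustrative only): the kind of k-fold cross validation that
# performCV above presumably wraps — mean accuracy over 10 folds for a scaled
# model. LogisticRegression and the fold count are assumptions for the demo.
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


def cross_validation_demo(X, y, folds=10):
    Xs = preprocessing.StandardScaler().fit_transform(X)
    scores = cross_val_score(LogisticRegression(), Xs, y,
                             cv=folds, scoring='accuracy')
    print("Mean CV accuracy: {0:.3%} (+/- {1:.3%})".format(scores.mean(),
                                                           scores.std()))
    return scores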
def evaluate(self, array_datas): """ Create a scatter plot between two variables """ sns.set(color_codes=True) command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas) if command_status == CommandStatus.Error: Printer.Print("please try the following command:", "Visualize comparison between...") return ResultObject(None, None, None, CommandStatus.Error) properties = self.createDefaultProperties() properties['title'] = cname win = Window.window() row_colors = None if StatContainer.ground_truth is None or len(StatContainer.ground_truth.data) != df.shape[0]: df.dropna(inplace=True) if df.shape[0] == 0: return ResultObject(None, None, None, CommandStatus.Error) array = df.values else: gt1 = pd.Series(StatContainer.filterGroundTruth()) df, gt1 = DataGuru.removenan(df, gt1) if df.shape[0] == 0: return ResultObject(None, None, None, CommandStatus.Error) lut = dict(zip(gt1.unique(), np.linspace(0, 1, gt1.unique().size))) row_colors = gt1.map(lut) array = df.values result_object = VizContainer.createResult( win, array_datas, ['scatter2d']) result_object.data = [win, properties, [ array, row_colors, kl1], self.updateFigure] self.updateFigure(result_object.data) self.modify_figure.evaluate(result_object) return result_object
def evaluate(self, data_frame, classifier_algos):
    """
    Train multiple classifiers on the data and compare them with k-fold cross validation
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    # Get the data frame
    sns.set(color_codes=True)
    df = data_frame.data

    # Get the ground truth array
    if StatContainer.ground_truth is None:
        Printer.Print("Please set a feature vector to ground truth by",
                      "typing 'set ground truth' before using this command")
        return result_object
    df = DataGuru.removeGT(df, StatContainer.ground_truth)
    Y = StatContainer.ground_truth.data

    # Remove nans:
    df, Y = DataGuru.removenan(df, Y)
    Printer.Print("Training classifiers using the following features:")
    Printer.Print(df.columns)

    # Get all the classifier models to test against each other
    modelList = []
    Printer.Print("Testing the following classifiers:")
    for classifier_algo in classifier_algos:
        model = classifier_algo.data[0]
        Printer.Print(classifier_algo.name)
        modelList.append({'Name': classifier_algo.name, 'Model': model})

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    Printer.Print('Finding the best classifier using',
                  'k-fold cross validation...')
    all_cv_scores, all_mean_cv_scores, all_confusion_matrices = DataGuru.FindBestClassifier(
        X, Y, modelList, 10)

    Printer.Print('\n\nPlotting the confusion matrices...\n')
    for i in range(len(modelList)):
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        DataGuru.plot_confusion_matrix(all_confusion_matrices[i],
                                       np.unique(Y), ax,
                                       title=modelList[i]['Name'])
        win.show()

    Printer.Print("\n\nBest classifier is " +
                  modelList[np.argmax(all_mean_cv_scores)]['Name'] +
                  " with an accuracy of %.2f%%" % max(all_mean_cv_scores))

    # TODO: Need to save the model and ask the user for a name for it
    result_object = ResultObject(None, None, None, CommandStatus.Success)
    return result_object
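# Hedged sketch (illustrative only): one plausible reading of what
# DataGuru.FindBestClassifier does — run k-fold cross validation for each candidate
# model and report the one with the highest mean accuracy. The candidate list and
# function name are assumptions for the demo.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


def find_best_classifier_demo(X, y, folds=10):
    candidates = [{'Name': 'Logistic regression', 'Model': LogisticRegression()},
                  {'Name': 'Random forest', 'Model': RandomForestClassifier()}]
    mean_scores = []
    for entry in candidates:
        scores = cross_val_score(entry['Model'], X, y, cv=folds,
                                 scoring='accuracy')
        mean_scores.append(scores.mean())
        print("{0}: {1:.3%}".format(entry['Name'], scores.mean()))
    best = candidates[int(np.argmax(mean_scores))]
    print("Best classifier:", best['Name'])
    return best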
def evaluate(self, data_frame, array_datas): """ Run pca on a dataset of multiple arrays """ # Get the data frame if data_frame is not None: df = data_frame.data df = DataGuru.convertStrCols_toNumeric(df) cname = data_frame.name elif array_datas is not None: command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame( array_datas, useCategorical=True) if command_status == CommandStatus.Error: return ResultObject(None, None, None, CommandStatus.Error) else: Printer.Print("Please provide data frame or arrays to analyze") return ResultObject(None, None, None, CommandStatus.Error) Y = None if StatContainer.ground_truth is not None: df = DataGuru.removeGT(df, StatContainer.ground_truth) Y = StatContainer.filterGroundTruth() # Remove nans: df, Y = DataGuru.removenan(df, Y) else: df.dropna(inplace=True) # Code to run the classifier X = df.values # Get a standard scaler for the extracted data X scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) # Train the classifier pca = PCA(n_components=2) pca_res = pca.fit_transform(X) win = Window.window() f = win.gcf() ax = f.add_subplot(111) if Y is None: sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], cmap="jet", edgecolor="None", alpha=0.35) else: sc = ax.scatter(pca_res[:, 0], pca_res[:, 1], c=Y, cmap="jet", edgecolor="None", alpha=0.35) cbar = plt.colorbar(sc) cbar.ax.get_yaxis().labelpad = 15 cbar.ax.set_ylabel(StatContainer.ground_truth.name, rotation=270) ax.set_title(cname) win.show() # return ResultObject(None, None, None, CommandStatus.Success) if data_frame is not None: return VizContainer.createResult(win, data_frame, ['pca']) else: return VizContainer.createResult(win, array_datas, ['pca'])
def evaluate(self, data_frame, array_datas, target):
    """
    Run clustering on a dataset of multiple arrays
    """
    # Get the data frame
    if data_frame is not None:
        df = data_frame.data
        cname = data_frame.name
    elif array_datas is not None:
        command_status, df, kl1, cname = DataGuru.transformArray_to_dataFrame(
            array_datas, useCategorical=True)
        if command_status == CommandStatus.Error:
            return ResultObject(None, None, None, CommandStatus.Error)
    else:
        Printer.Print("Please provide a data frame or arrays to analyze")
        return ResultObject(None, None, None, CommandStatus.Error)

    Y = None
    if StatContainer.ground_truth is not None:
        df = DataGuru.removeGT(df, StatContainer.ground_truth)
        Y = StatContainer.filterGroundTruth()
        # Remove nans:
        df, Y = DataGuru.removenan(df, Y)
    else:
        df.dropna(inplace=True)

    # Extract the feature matrix and standardize it
    X = df.values
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)

    # How many clusters to use
    numbers = findNumbers(target.data, 1)
    if numbers != [] and numbers[0].data > 0:
        num_clusters = int(numbers[0].data)
    else:
        num_clusters = 2  # If not specified, use 2 clusters

    # Run the clustering operation
    kY = self.performOperation(X, num_clusters)

    result_objects = []
    if StatContainer.ground_truth is not None:
        # Cross-tabulate the clustering result against the ground truth
        df_res = pd.DataFrame()
        df_res['ground_truth'] = Y
        df_res['clustering_result'] = kY
        df_res = df_res.pivot_table(index=df_res.columns[0],
                                    columns=df_res.columns[1],
                                    aggfunc=np.size, fill_value=0)
        win = Window.window()
        f = win.gcf()
        ax = f.add_subplot(111)
        df_res = DataGuru.convertStrCols_toNumeric(df_res)
        sns.heatmap(df_res, ax=ax)
        win.show()
        if data_frame is not None:
            result_object = VizContainer.createResult(
                win, data_frame, ['clstr.fig'])
        else:
            result_object = VizContainer.createResult(
                win, array_datas, ['clstr.fig'])
        result_objects.append(result_object)

    result_object = ResultObject(kY, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(cname, command_name="clstr",
                             set_keyword_list=True)
    result_objects.append(result_object)
    return result_objects
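# Hedged sketch (illustrative only): clustering followed by a cluster-vs-label
# contingency table, the comparison the heatmap above visualizes. KMeans stands in
# for whatever performOperation runs; the demo function name is invented.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans


def clustering_demo(X, y, num_clusters=2):
    Xs = preprocessing.StandardScaler().fit_transform(X)
    kY = KMeans(n_clusters=num_clusters, n_init=10).fit_predict(Xs)
    # Contingency table: rows are ground-truth labels, columns are cluster ids
    table = pd.crosstab(pd.Series(y, name='ground_truth'),
                        pd.Series(kY, name='clustering_result'))
    fig, ax = plt.subplots()
    sns.heatmap(table, annot=True, fmt='d', ax=ax)
    plt.show()
    return kY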