def test_plot_figures_output_show():
    """Smoke-test the plotting helpers when called with ``otype='show'``.

    Exercises ``plot_confusion_matrix``, ``plot_histogram`` and
    ``plot_heatmap`` in turn. Nothing is asserted: the test passes as long as
    none of the calls raises.
    """
    from pandas import DataFrame

    # Confusion matrices: a two-class table, then a four-class table.
    confusion_tables = (
        {'True': [2, 3], 'False': [5, 0]},
        {
            '7th-8th': [2, 3, 5, 0],
            'Masters': [0, 4, 1, 0],
            '11th': [0, 1, 5, 2],
            'Bachelors': [2, 0, 0, 6],
        },
    )
    for table in confusion_tables:
        plot_confusion_matrix(DataFrame(table), otype='show')

    # Histograms: each case is (bin labels, two rows of per-bin counts).
    # Numeric and categorical bins are interleaved in the same order as the
    # individual calls they replace.
    histogram_cases = (
        ([28., 29.25, 30.5, 31.75, 33., 34.25, 35.5, 36.75, 38., 39.25, 40.5,
          41.75, 43., 44.25, 45.5, 46.75, 48., 49.25, 50.5],
         [[1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
          [1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]),
        (['Private', 'Self-emp-not-inc', 'State-gov'],
         [[6, 2, 1], [6, 2, 1]]),
        (['11th', '9th', 'Bachelors', 'HS-grad', 'Masters'],
         [[3, 2, 2, 1, 1], [3, 2, 2, 1, 1]]),
        ([5., 5.45, 5.9, 6.35, 6.8, 7.25, 7.7, 8.15, 8.6, 9.05, 9.5, 9.95,
          10.4, 10.85, 11.3, 11.75, 12.2, 12.65, 13.1],
         [[1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0],
          [1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0]]),
        (['Female', 'Male'],
         [[5, 4], [5, 4]]),
    )
    for edges, freqs in histogram_cases:
        plot_histogram(np.array(edges), np.array(freqs), otype='show')

    # Heatmap of pairwise mutual information over the bundled sample data.
    from .testdata import adults01
    from ds4ml.metrics import pairwise_mutual_information
    mutual_info = pairwise_mutual_information(DataFrame(adults01))
    plot_heatmap(mutual_info, otype='show')
Ejemplo n.º 2
0
 def _get_dist(self):
     """Collect per-column distribution records.

     For every column in ``self.columns`` the result holds one dict with
     the column name (``'name'``), the bins and counts returned by
     ``self.dist`` (``'columns'`` and ``'data'``), and whatever
     ``plot_histogram`` returns for them (``'path'``).
     """
     from ds4ml.utils import plot_histogram

     def summarize(column):
         # One record per column; bins/counts come straight from self.dist.
         edges, freqs = self.dist(column)
         figure = plot_histogram(edges, freqs)
         return {
             'name': column,
             'columns': edges,
             'data': freqs,
             'path': figure,
         }

     return [summarize(column) for column in self.columns]
Ejemplo n.º 3
0
def test_plot_figures_output_show_special_characters():
    """Smoke-test ``plot_histogram`` with non-ASCII bin labels.

    Nothing is asserted: the test passes as long as the call does not raise.
    """
    labels = ['你好', 'Self-る', '¥¶ĎǨД']
    row = [6, 2, 1]
    # Two identical rows of counts, one per compared data set.
    plot_histogram(np.array(labels), np.array([row, row]), otype='show')
Ejemplo n.º 4
0
    def to_html(self,
                buf=None,
                title='Evaluation Report',
                info=True,
                distribute=True,
                correlate=True,
                classifier=None,
                labels=None,
                test=None):
        """
        Render the evaluation result of two data sets as an HTML file.

        The report sections are always computed; the HTML is only rendered
        and written when ``buf`` is given. Nothing is returned.

        Parameters
        ----------
        buf : str or path-like, optional
            path of the file to write the report to (it is passed to
            ``open``). When empty or None, no file is written.

        title : str
            title of evaluation report

        info : bool, default true
            show basic information of two data sets, including relative error,
            and Jensen-Shannon divergence (jsd).

        distribute : bool, default true
            show distribution of each attribute.

        correlate : bool, default true
            show correlation of pair-wise attributes.

        classifier : str
            use classifier to train data set on one or more columns (defined by
            parameter 'labels') and show prediction result on the evaluation
            report. Optional classifier: SVM.
            This method does not read the value itself; the classification
            section is produced whenever ``labels`` is not None.

        labels : list of column names
            columns used for the classification task; one entry is produced
            per column that can be classified.
            NOTE(review): the value is iterated directly, so a plain string
            would be walked character by character -- pass a list.

        test : pd.DataFrame
            test data for classification, and other machine learning tasks.
        """
        from ds4ml.utils import (plot_histogram, plot_heatmap,
                                 plot_confusion_matrix)
        from mako.template import Template
        import os

        # Load the template by absolute path instead of temporarily changing
        # the process-wide working directory: chdir is global state, was not
        # restored if Template() raised, and failed when dirname(__file__)
        # was an empty string.
        template_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'template', 'report.html')
        template = Template(filename=template_path)

        content = {}
        # format different kinds of evaluation result to unified style
        if info:
            content['basic'] = [self.describe().to_dict('split')]

        if distribute:
            content['dist'] = []
            for col in self.columns:
                bins, counts = self.dist(col)
                svg = plot_histogram(bins, counts)
                content['dist'].append({
                    'name': col,
                    'columns': bins,
                    'data': counts,
                    'path': svg
                })

        if correlate:
            source_mi, target_mi = self.corr()
            source_svg = plot_heatmap(source_mi)
            target_svg = plot_heatmap(target_mi)
            content['corr'] = [
                {'matrix': source_mi.to_dict('split'), 'path': source_svg},
                {'matrix': target_mi.to_dict('split'), 'path': target_svg},
            ]

        if labels is not None:
            content['svm'] = []
            for col in labels:
                # Same condition as before, simplified:
                # (test is not None and col in test) or (test is None).
                in_test = test is None or col in test
                try:
                    if in_test:
                        # classify() yields two confusion matrices here (one
                        # per data set, each against the actual values), so
                        # two diagrams sharing one value range are drawn.
                        source_cm, target_cm = self.classify(col, test=test)
                        vrange = (min(source_cm.values.min(),
                                      target_cm.values.min()),
                                  max(source_cm.values.max(),
                                      target_cm.values.max()))
                        path = (plot_confusion_matrix(source_cm,
                                                      vrange=vrange,
                                                      xlabel='raw',
                                                      ylabel='actual'),
                                plot_confusion_matrix(target_cm,
                                                      vrange=vrange,
                                                      xlabel='synth',
                                                      ylabel='actual'))
                    else:
                        # Column missing from the test data: compare the two
                        # predicted results with each other instead.
                        cm = self.classify(col, test=test)
                        # make path's type: 1-tuple
                        path = (plot_confusion_matrix(cm,
                                                      xlabel='synth',
                                                      ylabel='raw'), )
                    content['svm'].append({'column': col, 'path': path})
                except ValueError as e:
                    # Best effort: a column that cannot be classified is
                    # reported and left out of the report.
                    print(e)

        if buf:
            # Sections that were switched off are passed as empty lists
            # rather than raising KeyError.
            # NOTE(review): assumes the template tolerates empty sections --
            # confirm against template/report.html.
            rendered = template.render(title=title,
                                       basics=content.get('basic', []),
                                       dists=content.get('dist', []),
                                       corrs=content.get('corr', []),
                                       svms=content.get('svm', []))
            # Render before opening so a rendering failure cannot leave a
            # truncated report behind.
            with open(buf, 'w+', encoding='utf-8') as file:
                file.write(rendered)