Exemple #1
0
    def describeAdditiveComponents(self):
        """
        Write the "Additive Component Analysis" section: an overview of the
        final model's performance, a summary table, and one subsection per
        additive component.
        """
        doc = self.doc
        n_terms = len(self.kers)
        error = self.cums[-1].error()
        with doc.create(pl.Section("Additive Component Analysis")):
            if n_terms == 1:
                terms_str = "only one additive component"
            else:
                terms_str = "{0} additive components".format(n_terms)
            intro = (
                r"The pattern underlying the dataset can be decomposed into "
                + terms_str + ", "
                + r"which contribute jointly to the final classifier which we have trained. "
                + r"With all components in action, the classifier can achieve "
                + r"a cross-validated classification error rate of {0:.2f}\%. ".format(error * 100)
                + r"The performance cannot be further improved by adding more components. ")
            doc.append(ut.NoEscape(intro))

            table_note = (
                "\n\nIn Table 2 we list the full additive model, "
                "all input variables, as well as "
                "more complex additive components (if any) considered above, "
                "ranked by their cross-validated error. ")
            doc.append(ut.NoEscape(table_note))
            self.tabulateAll()

            # One subsection per additive term, in order of addition (1-based).
            for term in range(1, n_terms + 1):
                self.describeOneAdditiveComponent(term)
Exemple #2
0
    def describeVariables(self):
        """
        Write the "Individual Variable Analysis" section, with one
        subsection per input dimension's best one-dimensional classifier.
        """
        doc = self.doc
        n_terms = len(self.best1d)
        with doc.create(pl.Section("Individual Variable Analysis")):
            intro = (
                "First, we try to classify the training samples using only one "
                + "of the {0:d} input dimensions. ".format(n_terms)
                + "By considering the best classification performance that is "
                + "achievable in each dimension, we are able to infer which "
                + "dimensions are the most relevant to the class label assignment. ")
            doc.append(ut.NoEscape(intro))
            doc.append(ut.NoEscape("\n\n"))

            baseline_note = (
                "Note that in Table 1 the baseline performance is also included, which is achieved "
                "with a constant GP kernel. The effect of the baseline classifier "
                "is to indiscriminately classify all samples as either positive "
                "or negative, whichever is more frequent in the training data. "
                "The classifier performance in each input dimension will be "
                "compared against this baseline to tell if the input variable "
                "is discriminative in terms of class determination. ")
            doc.append(ut.NoEscape(baseline_note))

            # Describe variables in the order they appear in self.best1d.
            for ker in self.best1d:
                self.describeOneVariable(ker)
Exemple #3
0
    def makeIntro(self):
        """
        Append the report's leading paragraph, then the data summary.
        """
        doc = self.doc
        intro = (
            "This report is automatically generated by the AutoGPC. "
            "AutoGPC is an automatic Gaussian process (GP) classifier that "
            "aims to find the structure underlying a dataset without human input. "
            "For more information, please visit "
            r"\url{http://github.com/charles92/autogpc}.")
        doc.append(ut.NoEscape(intro))

        doc.append(ut.NoEscape("\n\n"))
        self.makeDataSummary()
Exemple #4
0
    def makeSummary(self):
        """
        Write the "Executive Summary" section: the best model found, its
        cross-validated performance, a ranked table of input variables,
        and an outline of the rest of the report.
        """
        best = self.cums[-1]
        data = best.data
        summands = self.summands
        doc = self.doc

        with doc.create(pl.Section("Executive Summary")):

            dims = np.array(best.getActiveDims())
            # TODO: Shall we normalise?
            # xvar = np.square(np.array(data.getDataShape()['x_sd'])[dims])
            # stvt = np.vstack((dims, best.sensitivity() / xvar))
            # stvt = np.vstack((dims, best.sensitivity()))
            # stvt = stvt[:,stvt[1,:].argsort()[::-1]] # Sort in descending order
            # dims = stvt[0,:].astype(int).tolist()
            # stvt = stvt[1,:]

            # Assemble the model description piecewise, then emit once.
            parts = ["The best kernel that we have found relates the class label assignment "
                     "to input ", dims2text(dims, data)]
            if len(dims) > 1:
                parts.append(". In specific, the model involves ")
                if len(summands) == 1:
                    parts.append(prod2text(summands[0].getActiveDims(), data))
                else:
                    parts.append(sum2text(summands))
            parts.append(" ($" + best.latex() + "$). ")
            doc.append(ut.NoEscape("".join(parts)))

            perf = ("This model can achieve "
                    + r"a cross-validated classification error of {0:.2f}\% and ".format(best.error() * 100)
                    + r"a negative log marginal likelihood of {0:.2f}. ".format(best.getNLML()))
            doc.append(ut.NoEscape(perf))

            relevance = ("\n\nWe have also analysed the relevance of each input variable. "
                         "They are listed below in Table 1 "
                         "in descending order of inferred relevance "
                         "(i.e.~in ascending order of cross-validated error of "
                         "the best one-dimensional GP classifier). ")
            doc.append(ut.NoEscape(relevance))
            self.tabulateVariables()

            outline = ("\n\nIn the rest of the report, we will first describe the "
                       "contribution of each individual input variable (Section 2). "
                       "This is followed by a detailed analysis of the additive "
                       "components that jointly make up the best model (Section 3). ")
            doc.append(ut.NoEscape(outline))
Exemple #5
0
    def makePreamble(self):
        """
        Populate the LaTeX preamble: packages, siunitx rounding options,
        and the title block; then emit \\maketitle into the body.
        """
        doc = self.doc
        packages = doc.packages

        # Layout, hyperlinks, UK-English conventions, fonts and units.
        packages.append(pl.Package('geometry', options=['margin=25mm']))
        packages.append(pl.Package('hyperref'))
        packages.append(pl.Package('babel', options=['UKenglish']))
        packages.append(pl.Package('isodate', options=['UKenglish']))
        packages.append(pl.Package('times'))
        packages.append(pl.Package('siunitx'))
        # Configure siunitx to round to 2 significant figures, scientific notation.
        packages.append(
            ut.NoEscape(
                r'\sisetup{round-precision=2,round-mode=figures,scientific-notation=true}'
            ))

        # Title block.
        preamble = doc.preamble
        title_str = r'AutoGPC Data Analysis Report on ' + self.name + ' Dataset'
        preamble.append(pl.Command('title', ut.NoEscape(title_str)))
        preamble.append(pl.Command('author', 'The Automatic Statistician'))
        preamble.append(pl.Command('date', ut.NoEscape(r'\today')))
        doc.append(ut.NoEscape(r'\maketitle'))
Exemple #6
0
    def makeDataSummary(self):
        """
        Append a summary of the training dataset: input dimensions, the
        binary output labels, the class balance, and a figure plotting all
        input dimensions with class label assignments.

        Uses ``self.history[1]`` to access the dataset and draw the plot.
        """
        kern = self.history[1]
        doc = self.doc
        data = kern.data
        # NOTE: removed unused local `dataShape` (was never referenced).

        npts = data.getNum()
        ndim = data.getDim()
        npos = data.getClass(1).shape[0]
        nneg = data.getClass(0).shape[0]

        # 3-dimensional data is saved as PNG; all other cases as vector EPS.
        imgName = 'data'
        imgFormat = '.eps' if ndim != 3 else '.png'
        imgOutName = imgName + imgFormat
        kern.draw(os.path.join(self.path, imgName), draw_posterior=False)

        # Input dimensions.
        s = "The training dataset spans {0} input dimensions, which are ".format(ndim) \
          + dims2text(range(ndim), data) + ". "
        doc.append(ut.NoEscape(s))

        # Output variable and the meaning of each binary label.
        s = "There is one binary output variable $y$, where " \
          + r"$y=0$ (negative) represents `{0}'".format(data.YLabel[0]) \
          + ", and " \
          + r"$y=1$ (positive) represents `{0}'".format(data.YLabel[1]) \
          + ". "
        doc.append(ut.NoEscape(s))

        # Sample counts and class balance; float() keeps the division exact
        # under Python 2 semantics as well.
        s = "\n\nThe training dataset contains {0} data points. ".format(npts) \
          + r"Among them, {0} ({1:.2f}\%) have positive class labels, ".format(npos, npos / float(npts) * 100) \
          + r"and the other {0} ({1:.2f}\%) have negative labels. ".format(nneg, nneg / float(npts) * 100) \
          + "All input dimensions as well as the class label assignments " \
          + "are plotted in Figure {0}. ".format(self.fignum)
        doc.append(ut.NoEscape(s))

        with doc.create(pl.Figure(position='htbp!')) as fig:
            fig.add_image(imgOutName, width=ut.NoEscape(r'0.8\textwidth'))
            s = "The input dataset. " \
              + r"Positive samples (`{0}') are coloured red, ".format(data.YLabel[1]) \
              + r"and negative ones (`{0}') blue. ".format(data.YLabel[0])
            fig.add_caption(ut.NoEscape(s))
            self.fignum += 1
Exemple #7
0
    def describeOneAdditiveComponent(self, term):
        """
        Generate a subsection that describes one additive component.

        :param term: 1-based index of the additive term to be analysed
        :type term: integer
        """
        ker, cum = self.kers[term - 1], self.cums[term - 1]
        data = ker.data
        kdims = ker.getActiveDims()
        error = cum.error()
        nlml = cum.getNLML()

        doc = self.doc
        with doc.create(pl.Subsection("Component {0}".format(term))):
            if term == 1:
                s = r"With only one additive component, the GP classifier can achieve " \
                  + r"a cross-validated classification error of {0:.2f}\%. ".format(error * 100) \
                  + r"The corresponding negative log marginal likelihood is {0:.2f}. ".format(nlml)
                doc.append(ut.NoEscape(s))

                s = r"This component operates on " + dims2text(kdims, data) + ", " \
                  + r"as shown in Figure {0}. ".format(self.fignum)
                doc.append(ut.NoEscape(s))

            else:
                # Error reduction relative to the model with one fewer term.
                # Computed inside this branch (previously it was a one-line
                # `if` above, leaving `delta` conditionally bound).
                delta = self.cums[term - 2].error() - error
                s = r"With {0} additive components, the cross-validated classification error ".format(term) \
                  + r"can be reduced by {0:.2f}\% to {1:.2f}\%. ".format(delta * 100, error * 100) \
                  + r"The corresponding negative log marginal likelihood is {0:.2f}. ".format(nlml)
                doc.append(ut.NoEscape(s))

                s = r"The additional component operates on " + dims2text(kdims, data) + ", " \
                  + r"as shown in Figure {0}. ".format(self.fignum)
                doc.append(ut.NoEscape(s))

            self.makeInteractionFigure(ker, cum, term)
Exemple #8
0
    def makeInteractionFigure(self, ker, cum, n_terms):
        """
        Create figure for interaction analysis, which includes a subfigure of
        the latest additive component and a subfigure of posterior of the overall
        compositional kernel up to now.

        :param ker: latest additive component to be plotted
        :type ker: GPCKernel
        :param cum: overall compositional kernel up to and including `ker`
        :type cum: GPCKernel
        :param n_terms: number of additive terms considered in `cum` so far
        :type n_terms: integer
        """
        assert isinstance(ker, GPCKernel), 'Kernel must be of type GPCKernel'
        assert isinstance(cum, GPCKernel), 'Kernel must be of type GPCKernel'

        doc = self.doc
        kerDims = ker.getActiveDims()
        cumDims = cum.getActiveDims()

        # Draw the latest component: 3 active dimensions -> PNG, otherwise EPS.
        img1Name = 'additive{0}ker'.format(n_terms)
        img1Format = '.eps' if len(kerDims) != 3 else '.png'
        img1Filename = img1Name + img1Format
        ker.draw(os.path.join(self.path, img1Name), active_dims_only=True)

        if n_terms == 1 or len(cumDims) > 3:
            # Only present current additive component
            # (single-term model, or cumulative kernel spans >3 dimensions —
            # presumably too many to plot; confirm against ker.draw()).
            caption_str = r"Trained classifier on " + dims2text(
                kerDims, ker.data) + "."
            with doc.create(pl.Figure(position='htbp!')) as fig:
                fig.add_image(img1Filename,
                              width=ut.NoEscape(r'0.7\textwidth'))
                fig.add_caption(ut.NoEscape(caption_str))
                self.fignum += 1

        else:
            # Present both current component and cumulative kernel
            img2Name = 'additive{0}cum'.format(n_terms)
            img2Format = '.eps' if len(cumDims) != 3 else '.png'
            img2Filename = img2Name + img2Format
            cum.draw(os.path.join(self.path, img2Name), active_dims_only=True)
            caption1_str = r"Current additive component involving " + dims2text(
                kerDims, ker.data) + "."
            caption2_str = r"Previous and current components combined, involving " + dims2text(
                cumDims, cum.data) + "."
            caption_str = r"Trained classifier on " + dims2text(
                cumDims, cum.data) + "."

            # Two side-by-side subfigures under a single figure number;
            # self.fignum is incremented once for the whole figure.
            with doc.create(pl.Figure(position='htbp!')) as fig:
                with doc.create(
                        pl.SubFigure(
                            position='b',
                            width=ut.NoEscape(r'0.47\textwidth'))) as subfig1:
                    subfig1.add_image(img1Filename,
                                      width=ut.NoEscape(r'\textwidth'))
                    subfig1.add_caption(ut.NoEscape(caption1_str))
                doc.append(ut.NoEscape(r'\hfill'))
                with doc.create(
                        pl.SubFigure(
                            position='b',
                            width=ut.NoEscape(r'0.47\textwidth'))) as subfig2:
                    subfig2.add_image(img2Filename,
                                      width=ut.NoEscape(r'\textwidth'))
                    subfig2.add_caption(ut.NoEscape(caption2_str))
                fig.add_caption(ut.NoEscape(caption_str))
                self.fignum += 1
Exemple #9
0
    def tabulateAll(self):
        """
        Create a table that summarises all input variables and additive
        components, including the constant kernel as baseline and the final
        full additive model.

        Rows are sorted primarily by cross-validated error, secondarily by
        NLML (both rounded as displayed); the baseline row is italicised
        and the best NLML / error entries are set in bold.
        """
        # 1-D variables
        ks = self.best1d[:]
        # Baseline: constant kernel
        ks.append(self.constker)
        # Additive components, if not 1-D
        for k in self.summands:
            if len(k.getActiveDims()) > 1:
                ks.append(k)
        # Full additive model, if it involves more than one additive term
        if len(self.summands) > 1:
            ks.append(self.history[-1])

        # Two stable sorts: NLML first so it breaks ties among equal errors.
        # NOTE: removed unused locals `data` and `ds` (never referenced).
        ks.sort(key=lambda k: round(k.getNLML(), 2))
        ks.sort(key=lambda k: round(k.error(), 4))

        # Column minima, rounded exactly as displayed, for bold highlighting.
        nlml_min = round(min(k.getNLML() for k in ks), 2)
        error_min = round(min(k.error() for k in ks), 4)

        doc = self.doc
        with doc.create(pl.Table(position='htbp!')) as tab:
            caption_str = "Classification performance of the full model, its additive components (if any), all input variables, and the baseline."
            tab.add_caption(ut.NoEscape(caption_str))

            t = pl.Tabular('rlrr')
            # Header
            t.add_hline()
            t.add_row((pl.MultiColumn(1, align='c', data='Dimensions'),
                       pl.MultiColumn(1, align='c', data='Kernel expression'),
                       pl.MultiColumn(1, align='c', data='NLML'),
                       pl.MultiColumn(1, align='c', data='Error')))
            t.add_hline()

            # Entries
            for k in ks:
                if k is self.constker:
                    # Baseline row, italicised.
                    row = [
                        ut.italic('--', escape=False),
                        ut.italic('$' + k.latex() + '$ (Baseline)',
                                  escape=False),
                        ut.italic('{0:.2f}'.format(k.getNLML()), escape=False),
                        ut.italic(r'{0:.2f}\%'.format(k.error() * 100),
                                  escape=False)
                    ]
                else:
                    # Active dimensions shown 1-based, comma-separated.
                    dims = sorted(k.getActiveDims())
                    row = [
                        ut.NoEscape(', '.join(str(d + 1) for d in dims)),
                        ut.NoEscape('$' + k.latex() + '$'),
                        ut.NoEscape('{0:.2f}'.format(k.getNLML())),
                        ut.NoEscape(r'{0:.2f}\%'.format(k.error() * 100))
                    ]
                # Bold the best NLML and the best error rate.
                if round(k.getNLML(), 2) == nlml_min:
                    row[2] = ut.bold(row[2])
                if round(k.error(), 4) == error_min:
                    row[3] = ut.bold(row[3])

                t.add_row(tuple(row))

            t.add_hline()

            tab.append(ut.NoEscape(r'\centering'))
            tab.append(t)
Exemple #10
0
    def tabulateVariables(self):
        """
        Create a table that summarises all input variables.

        Rows are sorted primarily by cross-validated error, secondarily by
        NLML (both rounded as displayed); the baseline (constant kernel)
        row is italicised and the best NLML / error entries are bolded.
        """
        # Candidate rows: best 1-D kernel per dimension, plus the baseline.
        ks = self.best1d[:]
        ks.append(self.constker)
        # Two stable sorts: NLML first so it breaks ties among equal errors.
        ks.sort(key=lambda k: round(k.getNLML(), 2))
        ks.sort(key=lambda k: round(k.error(), 4))
        data = ks[0].data
        ds = data.getDataShape()

        # Column minima, rounded exactly as displayed, for bold highlighting.
        nlml_min = round(min([k.getNLML() for k in ks]), 2)
        error_min = round(min([k.error() for k in ks]), 4)

        doc = self.doc
        with doc.create(pl.Table(position='htbp!')) as tab:
            tab.add_caption(ut.NoEscape("Input variables"))

            t = pl.Tabular('rlrrrcrr')
            # Header: two grouped super-columns over the last six columns.
            t.add_hline()
            t.add_row(('', '', pl.MultiColumn(3, align='c', data='Statistics'),
                       pl.MultiColumn(3,
                                      align='c',
                                      data='Classifier Performance')))
            t.add_hline(start=3, end=8)
            t.add_row((pl.MultiColumn(1, align='c', data='Dimension'),
                       pl.MultiColumn(1, align='c', data='Variable'),
                       pl.MultiColumn(1, align='c', data='Min'),
                       pl.MultiColumn(1, align='c', data='Max'),
                       pl.MultiColumn(1, align='c', data='Mean'),
                       pl.MultiColumn(1, align='c', data='Kernel'),
                       pl.MultiColumn(1, align='c', data='NLML'),
                       pl.MultiColumn(1, align='c', data='Error')))
            t.add_hline()

            # Entries
            for k in ks:
                if k is self.constker:
                    # Baseline row: no per-dimension statistics, italicised.
                    row = [
                        ut.italic('--', escape=False),
                        ut.italic('Baseline', escape=False),
                        ut.italic('--', escape=False),
                        ut.italic('--', escape=False),
                        ut.italic('--', escape=False),
                        ut.italic(k.shortInterp(), escape=False),
                        ut.italic('{0:.2f}'.format(k.getNLML()), escape=False),
                        ut.italic(r'{0:.2f}\%'.format(k.error() * 100),
                                  escape=False)
                    ]
                else:
                    # 1-D kernel: report stats of its single active dimension
                    # (displayed 1-based).
                    dim = k.getActiveDims()[0]
                    row = [
                        dim + 1, data.XLabel[dim],
                        '{0:.2f}'.format(ds['x_min'][dim]),
                        '{0:.2f}'.format(ds['x_max'][dim]),
                        '{0:.2f}'.format(ds['x_mu'][dim]),
                        k.shortInterp(),
                        ut.NoEscape('{0:.2f}'.format(k.getNLML())),
                        ut.NoEscape(r'{0:.2f}\%'.format(k.error() * 100))
                    ]
                # Bold the best NLML (col 6) and the best error (col 7).
                if round(k.getNLML(), 2) == nlml_min:
                    row[6] = ut.bold(row[6])
                if round(k.error(), 4) == error_min:
                    row[7] = ut.bold(row[7])

                t.add_row(tuple(row))

            t.add_hline()

            tab.append(ut.NoEscape(r'\centering'))
            tab.append(t)
Exemple #11
0
    def describeOneVariable(self, ker):
        """
        Generate a subsection describing a particular variable.

        :param ker: best one-dimensional kernel trained on the variable
        :type ker: GPCKernel
        """
        assert isinstance(ker, GPCKernel), 'Argument must be of type GPCKernel'
        assert len(
            ker.getActiveDims()) == 1, 'The kernel must be one-dimensional'

        dim = ker.getActiveDims()[0]
        data = ker.data
        ds = data.getDataShape()

        # Summary statistics of this dimension, and kernel diagnostics.
        xmu, xsd, xmin, xmax = ds['x_mu'][dim], ds['x_sd'][dim], ds['x_min'][
            dim], ds['x_max'][dim]
        error = ker.error()
        mon = ker.monotonicity()
        per = ker.period()

        doc = self.doc
        with doc.create(
                pl.Subsection(ut.NoEscape(dims2text([dim], data, cap=True)))):
            # Routine description
            s = dims2text([dim], data, cap=True) + " has " \
              + "mean value {0:.2f} and standard deviation {1:.2f}. ".format(xmu, xsd) \
              + "Its observed minimum and maximum are {0:.2f} and {1:.2f} respectively. ".format(xmin, xmax) \
              + "A GP classifier trained on this variable alone can achieve " \
              + r"a cross-validated classification error of {0:.2f}\%. ".format(error * 100)

            # Significance: wording chosen by the ratio of this variable's
            # error to the baseline error (thresholds 0.25 / 0.8 / 1.0).
            s += "\n\n"
            e0 = float(self.constker.error())
            if error / e0 < 0.25:
                s += "Compared with the null model (baseline), this variable "
                s += "contains strong evidence whether the sample belongs to "
                s += "class `{0}'. ".format(data.YLabel[1])
            elif error / e0 < 0.8:
                s += "Compared with the null model (baseline), this variable "
                s += "contains some evidence of class label assignment. "
            elif error / e0 < 1.0:
                s += "This variable provides little evidence of class label "
                s += r"assignment given a baseline error rate of {0:.2f}\%. ".format(
                    e0 * 100)
            else:
                s += "The classification performance (in terms of error rate) "
                s += "based on this variable alone is even worse than "
                s += r'that of the na{\"i}ve baseline classifier. '

            # Monotonicity and periodicity - only do this for significant factors
            # (i.e. the two branches above where error / e0 < 0.8).
            if error / e0 < 0.8:
                if mon != 0:
                    # Sign of monotonicity encodes correlation direction.
                    corr_str = "positive" if mon > 0 else "negative"
                    s += "There is a " + corr_str + " correlation between the value "
                    s += "of this variable and the likelihood of the sample being "
                    s += "classified as positive. "
                elif per != 0:
                    s += "The class assignment is approximately periodic with "
                    s += dims2text([dim], data) + ". "
                    s += "The period is about {0:.2f}. ".format(per)
                else:
                    s += "No significant monotonicity or periodicity "
                    s += "is associated with this variable. "

            s += "The GP posterior trained on this variable is plotted in Figure {0}. ".format(
                self.fignum)
            doc.append(ut.NoEscape(s))

            # Plotting
            imgName = 'var{0}'.format(dim)
            imgFormat = '.eps'
            imgFilename = imgName + imgFormat
            ker.draw(os.path.join(self.path, imgName), active_dims_only=True)
            caption_str = r"Trained classifier on " + dims2text([dim],
                                                                ker.data) + "."
            with doc.create(pl.Figure(position='htbp!')) as fig:
                fig.add_image(imgFilename, width=ut.NoEscape(r'0.7\textwidth'))
                fig.add_caption(ut.NoEscape(caption_str))
                self.fignum += 1