def describeAdditiveComponents(self):
    """
    Generate a section describing all additive components present.
    """
    n_terms = len(self.kers)
    error = self.cums[-1].error()
    doc = self.doc
    with doc.create(pl.Section("Additive Component Analysis")):
        terms_str = "only one additive component" if n_terms == 1 \
            else "{0} additive components".format(n_terms)
        s = r"The pattern underlying the dataset can be decomposed into " \
            + terms_str + ", " \
            + r"which contribute jointly to the final classifier that we have trained. " \
            + r"With all components in action, the classifier achieves " \
            + r"a cross-validated classification error rate of {0:.2f}\%. ".format(error * 100) \
            + r"The performance cannot be further improved by adding more components. "
        doc.append(ut.NoEscape(s))
        s = "\n\nTable 2 lists the full additive model, " \
            + "all input variables, and any more complex additive " \
            + "components considered above, " \
            + "ranked by their cross-validated error. "
        doc.append(ut.NoEscape(s))
        self.tabulateAll()
        for i in range(1, n_terms + 1):
            self.describeOneAdditiveComponent(i)
def describeVariables(self):
    """
    Generate a section describing all input dimensions / variables.
    """
    n_terms = len(self.best1d)
    doc = self.doc
    with doc.create(pl.Section("Individual Variable Analysis")):
        s = "First, we try to classify the training samples using only one " \
            + "of the {0:d} input dimensions. ".format(n_terms) \
            + "By considering the best classification performance " \
            + "achievable in each dimension, we can infer which " \
            + "dimensions are most relevant to the class label assignment. "
        doc.append(ut.NoEscape(s))
        doc.append(ut.NoEscape("\n\n"))
        s = "Note that Table 1 also includes the baseline performance, which is achieved " \
            + "with a constant GP kernel. The baseline classifier " \
            + "indiscriminately assigns all samples to whichever class " \
            + "is more frequent in the training data. " \
            + "The classifier performance in each input dimension is " \
            + "compared against this baseline to determine whether the " \
            + "input variable is discriminative of the class label. "
        doc.append(ut.NoEscape(s))
        for i in range(n_terms):
            self.describeOneVariable(self.best1d[i])
def makeIntro(self):
    """
    Make the leading paragraph.
    """
    doc = self.doc
    s = "This report is automatically generated by AutoGPC, " \
        + "an automatic Gaussian process (GP) classifier that " \
        + "aims to find the structure underlying a dataset without human input. " \
        + "For more information, please visit " \
        + r"\url{http://github.com/charles92/autogpc}."
    doc.append(ut.NoEscape(s))
    doc.append(ut.NoEscape("\n\n"))
    self.makeDataSummary()
def makeSummary(self):
    """
    Generate the summary section.
    """
    best = self.cums[-1]
    data = best.data
    summands = self.summands
    doc = self.doc
    with doc.create(pl.Section("Executive Summary")):
        dims = np.array(best.getActiveDims())
        # TODO: Shall we normalise?
        # xvar = np.square(np.array(data.getDataShape()['x_sd'])[dims])
        # stvt = np.vstack((dims, best.sensitivity() / xvar))
        # stvt = np.vstack((dims, best.sensitivity()))
        # stvt = stvt[:,stvt[1,:].argsort()[::-1]]  # Sort in descending order
        # dims = stvt[0,:].astype(int).tolist()
        # stvt = stvt[1,:]
        s = "The best kernel that we have found relates the class label assignment " \
            + "to input " + dims2text(dims, data)
        if len(dims) > 1:
            s += ". Specifically, the model involves "
            if len(summands) == 1:
                s += prod2text(summands[0].getActiveDims(), data)
            else:
                s += sum2text(summands)
        s += " ($" + best.latex() + "$). "
        doc.append(ut.NoEscape(s))
        s = "This model achieves " \
            + r"a cross-validated classification error of {0:.2f}\% and ".format(best.error() * 100) \
            + r"a negative log marginal likelihood of {0:.2f}. ".format(best.getNLML())
        doc.append(ut.NoEscape(s))
        s = "\n\nWe have also analysed the relevance of each input variable. " \
            + "The variables are listed in Table 1 below " \
            + "in descending order of inferred relevance " \
            + "(i.e.~in ascending order of cross-validated error of " \
            + "the best one-dimensional GP classifier). "
        doc.append(ut.NoEscape(s))
        self.tabulateVariables()
        s = "\n\nIn the rest of the report, we first describe the " \
            + "contribution of each individual input variable (Section 2). " \
            + "This is followed by a detailed analysis of the additive " \
            + "components that jointly make up the best model (Section 3). "
        doc.append(ut.NoEscape(s))
def makePreamble(self):
    """
    Set up the LaTeX packages and the title block of the report.
    """
    doc = self.doc
    doc.packages.append(pl.Package('geometry', options=['margin=25mm']))
    doc.packages.append(pl.Package('hyperref'))
    doc.packages.append(pl.Package('babel', options=['UKenglish']))
    doc.packages.append(pl.Package('isodate', options=['UKenglish']))
    doc.packages.append(pl.Package('times'))
    doc.packages.append(pl.Package('siunitx'))
    doc.packages.append(ut.NoEscape(
        r'\sisetup{round-precision=2,round-mode=figures,scientific-notation=true}'))
    title_str = r'AutoGPC Data Analysis Report on ' + self.name + ' Dataset'
    doc.preamble.append(pl.Command('title', ut.NoEscape(title_str)))
    doc.preamble.append(pl.Command('author', 'The Automatic Statistician'))
    doc.preamble.append(pl.Command('date', ut.NoEscape(r'\today')))
    doc.append(ut.NoEscape(r'\maketitle'))
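# For reference, the preamble assembled above renders to roughly the
# following LaTeX (a sketch only; the exact output is produced by pylatex):
#
#   \usepackage[margin=25mm]{geometry}
#   \usepackage{hyperref}
#   \usepackage[UKenglish]{babel}
#   \usepackage[UKenglish]{isodate}
#   \usepackage{times}
#   \usepackage{siunitx}
#   \sisetup{round-precision=2,round-mode=figures,scientific-notation=true}
#   \title{AutoGPC Data Analysis Report on <name> Dataset}
#   \author{The Automatic Statistician}
#   \date{\today}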
def makeDataSummary(self):
    """
    Generate the dataset summary: input dimensions, class labels,
    class balance, and a plot of the raw data.
    """
    kern = self.history[1]
    doc = self.doc
    data = kern.data
    dataShape = data.getDataShape()
    npts = data.getNum()
    ndim = data.getDim()
    npos = data.getClass(1).shape[0]
    nneg = data.getClass(0).shape[0]

    # 3-D data is rendered as PNG; everything else as EPS
    imgName = 'data'
    imgFormat = '.eps' if ndim != 3 else '.png'
    imgOutName = imgName + imgFormat
    kern.draw(os.path.join(self.path, imgName), draw_posterior=False)

    # with doc.create(pl.Section("The Dataset")):
    s = "The training dataset spans {0} input dimensions, which are ".format(ndim) \
        + dims2text(range(ndim), data) + ". "
    doc.append(ut.NoEscape(s))
    s = "There is one binary output variable $y$, where " \
        + r"$y=0$ (negative) represents `{0}'".format(data.YLabel[0]) \
        + ", and " \
        + r"$y=1$ (positive) represents `{0}'".format(data.YLabel[1]) \
        + ". "
    doc.append(ut.NoEscape(s))
    s = "\n\nThe training dataset contains {0} data points. ".format(npts) \
        + r"Among them, {0} ({1:.2f}\%) have positive class labels, ".format(npos, npos / float(npts) * 100) \
        + r"and the other {0} ({1:.2f}\%) have negative labels. ".format(nneg, nneg / float(npts) * 100) \
        + "All input dimensions as well as the class label assignments " \
        + "are plotted in Figure {0}. ".format(self.fignum)
    doc.append(ut.NoEscape(s))

    with doc.create(pl.Figure(position='htbp!')) as fig:
        fig.add_image(imgOutName, width=ut.NoEscape(r'0.8\textwidth'))
        s = "The input dataset. " \
            + r"Positive samples (`{0}') are coloured red, ".format(data.YLabel[1]) \
            + r"and negative ones (`{0}') blue. ".format(data.YLabel[0])
        fig.add_caption(ut.NoEscape(s))
    self.fignum += 1
def describeOneAdditiveComponent(self, term):
    """
    Generate a subsection that describes one additive component.

    :param term: 1-based index of the additive component to be analysed
    :type term: int
    """
    ker, cum = self.kers[term - 1], self.cums[term - 1]
    data = ker.data
    kdims = ker.getActiveDims()
    error = cum.error()
    if term > 1:
        delta = self.cums[term - 2].error() - error
    nlml = cum.getNLML()
    doc = self.doc
    with doc.create(pl.Subsection("Component {0}".format(term))):
        if term == 1:
            s = r"With only one additive component, the GP classifier can achieve " \
                + r"a cross-validated classification error of {0:.2f}\%. ".format(error * 100) \
                + r"The corresponding negative log marginal likelihood is {0:.2f}. ".format(nlml)
            doc.append(ut.NoEscape(s))
            s = r"This component operates on " + dims2text(kdims, data) + ", " \
                + r"as shown in Figure {0}. ".format(self.fignum)
            doc.append(ut.NoEscape(s))
        else:
            s = r"With {0} additive components, the cross-validated classification error ".format(term) \
                + r"can be reduced by {0:.2f}\% to {1:.2f}\%. ".format(delta * 100, error * 100) \
                + r"The corresponding negative log marginal likelihood is {0:.2f}. ".format(nlml)
            doc.append(ut.NoEscape(s))
            s = r"The additional component operates on " + dims2text(kdims, data) + ", " \
                + r"as shown in Figure {0}. ".format(self.fignum)
            doc.append(ut.NoEscape(s))
        self.makeInteractionFigure(ker, cum, term)
def makeInteractionFigure(self, ker, cum, n_terms):
    """
    Create a figure for interaction analysis, which includes a subfigure of
    the latest additive component and a subfigure of the posterior of the
    overall compositional kernel up to now.

    :param ker: latest additive component to be plotted
    :type ker: GPCKernel
    :param cum: overall compositional kernel up to and including `ker`
    :type cum: GPCKernel
    :param n_terms: number of additive terms considered in `cum` so far
    """
    assert isinstance(ker, GPCKernel), 'Kernel must be of type GPCKernel'
    assert isinstance(cum, GPCKernel), 'Kernel must be of type GPCKernel'
    doc = self.doc
    kerDims = ker.getActiveDims()
    cumDims = cum.getActiveDims()

    img1Name = 'additive{0}ker'.format(n_terms)
    img1Format = '.eps' if len(kerDims) != 3 else '.png'
    img1Filename = img1Name + img1Format
    ker.draw(os.path.join(self.path, img1Name), active_dims_only=True)

    if n_terms == 1 or len(cumDims) > 3:
        # Only present current additive component
        caption_str = r"Trained classifier on " + dims2text(kerDims, ker.data) + "."
        with doc.create(pl.Figure(position='htbp!')) as fig:
            fig.add_image(img1Filename, width=ut.NoEscape(r'0.7\textwidth'))
            fig.add_caption(ut.NoEscape(caption_str))
        self.fignum += 1
    else:
        # Present both current component and cumulative kernel
        img2Name = 'additive{0}cum'.format(n_terms)
        img2Format = '.eps' if len(cumDims) != 3 else '.png'
        img2Filename = img2Name + img2Format
        cum.draw(os.path.join(self.path, img2Name), active_dims_only=True)
        caption1_str = r"Current additive component involving " \
            + dims2text(kerDims, ker.data) + "."
        caption2_str = r"Previous and current components combined, involving " \
            + dims2text(cumDims, cum.data) + "."
        caption_str = r"Trained classifier on " + dims2text(cumDims, cum.data) + "."
        with doc.create(pl.Figure(position='htbp!')) as fig:
            with doc.create(pl.SubFigure(
                    position='b',
                    width=ut.NoEscape(r'0.47\textwidth'))) as subfig1:
                subfig1.add_image(img1Filename, width=ut.NoEscape(r'\textwidth'))
                subfig1.add_caption(ut.NoEscape(caption1_str))
            doc.append(ut.NoEscape(r'\hfill'))
            with doc.create(pl.SubFigure(
                    position='b',
                    width=ut.NoEscape(r'0.47\textwidth'))) as subfig2:
                subfig2.add_image(img2Filename, width=ut.NoEscape(r'\textwidth'))
                subfig2.add_caption(ut.NoEscape(caption2_str))
            fig.add_caption(ut.NoEscape(caption_str))
        self.fignum += 1
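# The two-subfigure branch above renders to roughly the following LaTeX
# (a sketch only; exact output, including the subcaption machinery, is
# produced by pylatex):
#
#   \begin{figure}[htbp!]
#     \begin{subfigure}[b]{0.47\textwidth} ... \end{subfigure}
#     \hfill
#     \begin{subfigure}[b]{0.47\textwidth} ... \end{subfigure}
#     \caption{Trained classifier on ...}
#   \end{figure}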
def tabulateAll(self):
    """
    Create a table that summarises all input variables and additive
    components, including the constant kernel as baseline and the final
    full additive model.
    """
    # 1-D variables
    ks = self.best1d[:]
    # Baseline: constant kernel
    ks.append(self.constker)
    # Additive components, if not 1-D
    for k in self.summands:
        if len(k.getActiveDims()) > 1:
            ks.append(k)
    # Full additive model, if it involves more than one additive term
    best = self.history[-1]
    if len(self.summands) > 1:
        ks.append(best)
    # Two stable sorts: rank by error, breaking ties by NLML
    ks.sort(key=lambda k: round(k.getNLML(), 2))
    ks.sort(key=lambda k: round(k.error(), 4))
    nlml_min = round(min([k.getNLML() for k in ks]), 2)
    error_min = round(min([k.error() for k in ks]), 4)

    doc = self.doc
    with doc.create(pl.Table(position='htbp!')) as tab:
        caption_str = "Classification performance of the full model, " \
            + "its additive components (if any), all input variables, " \
            + "and the baseline."
        tab.add_caption(ut.NoEscape(caption_str))
        t = pl.Tabular('rlrr')
        # Header
        t.add_hline()
        t.add_row((pl.MultiColumn(1, align='c', data='Dimensions'),
                   pl.MultiColumn(1, align='c', data='Kernel expression'),
                   pl.MultiColumn(1, align='c', data='NLML'),
                   pl.MultiColumn(1, align='c', data='Error')))
        t.add_hline()
        # Entries
        for k in ks:
            if k is self.constker:
                # Baseline entry is typeset in italics
                row = [ut.italic('--', escape=False),
                       ut.italic('$' + k.latex() + '$ (Baseline)', escape=False),
                       ut.italic('{0:.2f}'.format(k.getNLML()), escape=False),
                       ut.italic(r'{0:.2f}\%'.format(k.error() * 100), escape=False)]
            else:
                dims = sorted(k.getActiveDims())
                row = [ut.NoEscape(', '.join([str(d + 1) for d in dims])),
                       ut.NoEscape('$' + k.latex() + '$'),
                       ut.NoEscape('{0:.2f}'.format(k.getNLML())),
                       ut.NoEscape(r'{0:.2f}\%'.format(k.error() * 100))]
            # Highlight the best NLML and the best error in bold
            if round(k.getNLML(), 2) == nlml_min:
                row[2] = ut.bold(row[2])
            if round(k.error(), 4) == error_min:
                row[3] = ut.bold(row[3])
            t.add_row(tuple(row))
        t.add_hline()
        tab.append(ut.NoEscape(r'\centering'))
        tab.append(t)
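# The pair of consecutive sorts in tabulateAll (and in tabulateVariables
# below) relies on Python's sort being stable: sorting by the secondary key
# (NLML) first and the primary key (error) second yields rows ordered by
# error, with NLML breaking ties. A minimal self-contained sketch with
# made-up values (the tuples are illustrative, not real kernels):
#
#   records = [('A', 0.10, -12.3), ('B', 0.10, -15.0), ('C', 0.05, -9.8)]
#   records.sort(key=lambda r: round(r[2], 2))  # secondary key: NLML
#   records.sort(key=lambda r: round(r[1], 4))  # primary key: error
#   # -> [('C', 0.05, -9.8), ('B', 0.10, -15.0), ('A', 0.10, -12.3)]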
def tabulateVariables(self):
    """
    Create a table that summarises all input variables.
    """
    ks = self.best1d[:]
    ks.append(self.constker)
    # Two stable sorts: rank by error, breaking ties by NLML
    ks.sort(key=lambda k: round(k.getNLML(), 2))
    ks.sort(key=lambda k: round(k.error(), 4))
    data = ks[0].data
    ds = data.getDataShape()
    nlml_min = round(min([k.getNLML() for k in ks]), 2)
    error_min = round(min([k.error() for k in ks]), 4)

    doc = self.doc
    with doc.create(pl.Table(position='htbp!')) as tab:
        tab.add_caption(ut.NoEscape("Input variables"))
        t = pl.Tabular('rlrrrcrr')
        # Header
        t.add_hline()
        t.add_row(('', '',
                   pl.MultiColumn(3, align='c', data='Statistics'),
                   pl.MultiColumn(3, align='c', data='Classifier Performance')))
        t.add_hline(start=3, end=8)
        t.add_row((pl.MultiColumn(1, align='c', data='Dimension'),
                   pl.MultiColumn(1, align='c', data='Variable'),
                   pl.MultiColumn(1, align='c', data='Min'),
                   pl.MultiColumn(1, align='c', data='Max'),
                   pl.MultiColumn(1, align='c', data='Mean'),
                   pl.MultiColumn(1, align='c', data='Kernel'),
                   pl.MultiColumn(1, align='c', data='NLML'),
                   pl.MultiColumn(1, align='c', data='Error')))
        t.add_hline()
        # Entries
        for k in ks:
            if k is self.constker:
                # Baseline entry is typeset in italics
                row = [ut.italic('--', escape=False),
                       ut.italic('Baseline', escape=False),
                       ut.italic('--', escape=False),
                       ut.italic('--', escape=False),
                       ut.italic('--', escape=False),
                       ut.italic(k.shortInterp(), escape=False),
                       ut.italic('{0:.2f}'.format(k.getNLML()), escape=False),
                       ut.italic(r'{0:.2f}\%'.format(k.error() * 100), escape=False)]
            else:
                dim = k.getActiveDims()[0]
                row = [dim + 1,
                       data.XLabel[dim],
                       '{0:.2f}'.format(ds['x_min'][dim]),
                       '{0:.2f}'.format(ds['x_max'][dim]),
                       '{0:.2f}'.format(ds['x_mu'][dim]),
                       k.shortInterp(),
                       ut.NoEscape('{0:.2f}'.format(k.getNLML())),
                       ut.NoEscape(r'{0:.2f}\%'.format(k.error() * 100))]
            # Highlight the best NLML and the best error in bold
            if round(k.getNLML(), 2) == nlml_min:
                row[6] = ut.bold(row[6])
            if round(k.error(), 4) == error_min:
                row[7] = ut.bold(row[7])
            t.add_row(tuple(row))
        t.add_hline()
        tab.append(ut.NoEscape(r'\centering'))
        tab.append(t)
def describeOneVariable(self, ker):
    """
    Generate a subsection describing a particular input variable.

    :param ker: one-dimensional kernel trained on the variable in question
    :type ker: GPCKernel
    """
    assert isinstance(ker, GPCKernel), 'Argument must be of type GPCKernel'
    assert len(ker.getActiveDims()) == 1, 'The kernel must be one-dimensional'
    dim = ker.getActiveDims()[0]
    data = ker.data
    ds = data.getDataShape()
    xmu, xsd = ds['x_mu'][dim], ds['x_sd'][dim]
    xmin, xmax = ds['x_min'][dim], ds['x_max'][dim]
    error = ker.error()
    mon = ker.monotonicity()
    per = ker.period()

    doc = self.doc
    with doc.create(pl.Subsection(ut.NoEscape(dims2text([dim], data, cap=True)))):
        # Routine description
        s = dims2text([dim], data, cap=True) + " has " \
            + "mean value {0:.2f} and standard deviation {1:.2f}. ".format(xmu, xsd) \
            + "Its observed minimum and maximum are {0:.2f} and {1:.2f} respectively. ".format(xmin, xmax) \
            + "A GP classifier trained on this variable alone can achieve " \
            + r"a cross-validated classification error of {0:.2f}\%. ".format(error * 100)

        # Significance relative to the baseline error
        s += "\n\n"
        e0 = float(self.constker.error())
        if error / e0 < 0.25:
            s += "Compared with the null model (baseline), this variable "
            s += "contains strong evidence of whether the sample belongs to "
            s += "class `{0}'. ".format(data.YLabel[1])
        elif error / e0 < 0.8:
            s += "Compared with the null model (baseline), this variable "
            s += "contains some evidence of class label assignment. "
        elif error / e0 < 1.0:
            s += "This variable provides little evidence of class label "
            s += r"assignment given a baseline error rate of {0:.2f}\%. ".format(e0 * 100)
        else:
            s += "The classification performance (in terms of error rate) "
            s += "based on this variable alone is even worse than "
            s += r'that of the na{\"i}ve baseline classifier. '

        # Monotonicity and periodicity - only reported for significant variables
        if error / e0 < 0.8:
            if mon != 0:
                corr_str = "positive" if mon > 0 else "negative"
                s += "There is a " + corr_str + " correlation between the value "
                s += "of this variable and the likelihood of the sample being "
                s += "classified as positive. "
            elif per != 0:
                s += "The class assignment is approximately periodic with "
                s += dims2text([dim], data) + ". "
                s += "The period is about {0:.2f}. ".format(per)
            else:
                s += "No significant monotonicity or periodicity "
                s += "is associated with this variable. "
        s += "The GP posterior trained on this variable is plotted in Figure {0}. ".format(self.fignum)
        doc.append(ut.NoEscape(s))

        # Plotting
        imgName = 'var{0}'.format(dim)
        imgFormat = '.eps'
        imgFilename = imgName + imgFormat
        ker.draw(os.path.join(self.path, imgName), active_dims_only=True)
        caption_str = r"Trained classifier on " + dims2text([dim], ker.data) + "."
        with doc.create(pl.Figure(position='htbp!')) as fig:
            fig.add_image(imgFilename, width=ut.NoEscape(r'0.7\textwidth'))
            fig.add_caption(ut.NoEscape(caption_str))
        self.fignum += 1
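# A minimal usage sketch, assuming these methods belong to a report class
# (called `Report` here purely for illustration; the actual class name and
# constructor are assumptions, inferred from attributes used above such as
# `self.doc` (a pylatex.Document), `self.name` and `self.path`):
#
#   report = Report(...)                  # hypothetical constructor
#   report.makePreamble()                 # packages, title, \maketitle
#   report.makeIntro()                    # leading paragraph + data summary
#   report.makeSummary()                  # executive summary + Table 1
#   report.describeVariables()            # Section 2: per-variable analysis
#   report.describeAdditiveComponents()   # Section 3: additive components
#   report.doc.generate_pdf(os.path.join(report.path, 'report'))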