def MakePlot(details, final, counts, pickVects, nModels, nTrueActs=-1):
  """Write the gnuplot data (.dat) and command (.gnu) files for an enrichment plot.

  Arguments:
    details: options holder; only `plotFile` (output basename) and the
      optional `showPlot` flag are used here
    final: per-pick accumulated (numCorrect, numPicked) sums
    counts: number of runs contributing to each pick index
    pickVects: per-pick lists of individual results; used for the 90%
      confidence-interval error bars when averaging over multiple models
    nModels: number of models averaged; >1 enables the error-bar output
    nTrueActs: if positive, used to fix the plot's y range

  Returns nothing; writes `<plotFile>.dat` and `<plotFile>.gnu` and, when
  `details.showPlot` is set, asks Gnuplot to display the result.
  """
  if not hasattr(details, 'plotFile') or not details.plotFile:
    return

  # write the data file: one row per pick index, stopping at the first
  # index with no contributing runs
  dataFileName = '%s.dat' % (details.plotFile)
  i = 0
  with open(dataFileName, 'w+') as outF:
    while i < len(final) and counts[i] != 0:
      if nModels > 1:
        _, sd = Stats.MeanAndDev(pickVects[i])
        confInterval = Stats.GetConfidenceInterval(sd, len(pickVects[i]), level=90)
        outF.write('%d %f %f %d %f\n' % (i + 1, final[i][0] / counts[i],
                                         final[i][1] / counts[i], counts[i], confInterval))
      else:
        outF.write('%d %f %f %d\n' % (i + 1, final[i][0] / counts[i],
                                      final[i][1] / counts[i], counts[i]))
      i += 1

  # write the gnuplot command file that plots the data written above
  plotFileName = '%s.gnu' % (details.plotFile)
  with open(plotFileName, 'w+') as gnuF:
    gnuHdr = """# Generated by EnrichPlot.py version: %s
set size square 0.7
set xr [0:]
set data styl points
set ylab 'Num Correct Picks'
set xlab 'Num Picks'
set grid
set nokey
set term postscript enh color solid "Helvetica" 16
set term X
""" % (__VERSION_STRING)
    print(gnuHdr, file=gnuF)
    if nTrueActs > 0:
      print('set yr [0:%d]' % nTrueActs, file=gnuF)
    print('plot x with lines', file=gnuF)
    if nModels > 1:
      # thin the error bars so only ~20 are drawn; integer division keeps
      # gnuplot's `every` argument an integer (the old `i / 20` was a
      # py2-ism that produced a float under Python 3)
      everyGap = i // 20
      print('replot "%s" using 1:2 with lines,' % (dataFileName), end='', file=gnuF)
      print('"%s" every %d using 1:2:5 with yerrorbars' % (dataFileName, everyGap), file=gnuF)
    else:
      print('replot "%s" with points' % (dataFileName), file=gnuF)

  # optionally display the plot; best-effort only, so any failure
  # (e.g. Gnuplot not installed) is reported but not fatal
  if hasattr(details, 'showPlot') and details.showPlot:
    try:
      from Gnuplot import Gnuplot
      p = Gnuplot()
      p('load "%s"' % (plotFileName))
      input('press return to continue...\n')
    except Exception:
      import traceback
      traceback.print_exc()
def testTransform2(self):
  """ testing that rotation of points into PCA frame doesn't change dot products """
  # raw data: column 0 is a row index, columns 1-3 are the features used
  # here, and the final two columns are dropped below
  self.d = numpy.array(
    [[1, 2.068703704, 2.040555556, 2.068703704, 2.141782407, 7.46],
     [2, 1.48537037, -0.186756425, 1.48537037, 1.803819444, 8.16],
     [3, 1.917469136, 0.785465797, 1.917469136, 2.046875, 8.68],
     [4, 2.068703704, 1.125743575, 2.068703704, 2.131944444, 8.89],
     [5, 2.138703704, 1.283243575, 2.138703704, 2.171319444, 9.25],
     [6, 2.152037037, 1.313243575, 2.152037037, 2.178819444, 9.3],
     [7, 1.730740741, 1.457222222, -0.179901738, 1.558449074, 7.52],
     [8, 1.973796296, 1.889320988, 0.792320484, 1.99054784, 8.16],
     [9, 2.058865741, 2.040555556, 1.132598262, 2.141782407, 8.3],
     [10, 2.098240741, 2.110555556, 1.290098262, 2.211782407, 8.4],
     [11, 2.105740741, 2.123888889, 1.320098262, 2.225115741, 8.46],
     [12, 1.390462963, -0.37502803, 0.171950113, 1.652584877, 8.19],
     [13, 1.475532407, -0.223793462, 0.512227891, 1.803819444, 8.57],
     [14, 1.522407407, -0.140460128, 0.699727891, 1.887152778, 8.82],
     [15, 1.822561728, 0.597194192, 0.604048879, 1.895640432, 8.89],
     [16, 1.907631173, 0.74842876, 0.944326657, 2.046875, 8.92],
     [17, 1.954506173, 0.831762094, 1.131826657, 2.130208333, 8.96],
     [18, 1.973796296, 0.93747197, 0.755283447, 1.980709877, 9],
     [19, 2.058865741, 1.088706538, 1.095561224, 2.131944444, 9.35],
     [20, 2.105740741, 1.172039872, 1.283061224, 2.215277778, 9.22],
     [21, 2.189074074, 1.359539872, 1.366394558, 2.262152778, 9.3],
     [22, 2.142199074, 1.276206538, 1.178894558, 2.178819444, 9.52]], 'd')
  # keep only the three feature columns (drop the index column and the
  # trailing two columns)
  self.d = self.d[:, 1:-2]
  eVals, eVects = Stats.PrincipalComponents(self.d)
  pts = Stats.TransformPoints(eVects, self.d)
  # center the original data so it is comparable with the (mean-centered)
  # transformed points; TransformPoints is given the uncentered data, so
  # presumably it centers internally -- TODO confirm
  avg = sum(self.d) / len(self.d)
  self.d -= avg
  for i in range(len(pts)):
    for j in range(len(pts)):
      # NOTE: the /= below normalizes the rows of self.d and pts IN PLACE;
      # on later loop passes the rows are already unit length, so
      # re-normalizing divides by 1 and is a (wasteful but harmless) no-op
      vi = self.d[i]
      vi /= numpy.sqrt(numpy.dot(vi, vi))
      vj = self.d[j]
      vj /= numpy.sqrt(numpy.dot(vj, vj))
      pvi = pts[i]
      pvi /= numpy.sqrt(numpy.dot(pvi, pvi))
      pvj = pts[j]
      pvj /= numpy.sqrt(numpy.dot(pvj, pvj))
      # a rotation must preserve angles: the pairwise dot products of the
      # unit vectors have to agree before and after the transform
      assert feq(numpy.dot(vi, vj), numpy.dot(pvi, pvj)), \
          'bad dot: %4.4f %4.4f' % (numpy.dot(vi, vj), numpy.dot(pvi, pvj))
def testCorrelation(self):
  """Verify FormCorrelationMatrix against precomputed reference values."""
  corr = Stats.FormCorrelationMatrix(self.d)
  expected = numpy.array([[1., 0.66865732, -0.10131374],
                          [0.66865732, 1., -0.28792771],
                          [-0.10131374, -0.28792771, 1.]])
  # every entry must agree with the reference to within FLOAT_TOL
  assert max(abs(corr - expected).ravel()) < FLOAT_TOL, 'correlation matrix incorrect'
def StdDev(mat):
  """ the standard deviation classifier

    This uses _ML.Data.Stats.StandardizeMatrix()_ to do the work

  """
  # NOTE(review): despite the name, this does not return a standard
  # deviation -- it returns the standardized matrix produced by
  # Stats.StandardizeMatrix(); the name is kept for backward compatibility
  return Stats.StandardizeMatrix(mat)
def testTransform(self):
  """ test transformation to PCA frame

  Transforms self.d into the PCA frame and checks three representative
  points against precomputed reference coordinates.
  """
  # only the eigenvectors (the rotation) are needed here; discard the
  # eigenvalues, matching the idiom used by the other testTransform
  _, eVects = Stats.PrincipalComponents(self.d)
  pts = Stats.TransformPoints(eVects, self.d)
  p0 = numpy.array([-1.12488653, -1.84061768, -0.1294482])
  p3 = numpy.array([-3.82295273, -3.09754194, 0.24549203])
  p5 = numpy.array([2.29785176, 3.4726933, -0.36094115])
  assert max(abs(pts[0] - p0)) < FLOAT_TOL, 'p0 comparison failed %s!=%s' % (str(
    pts[0]), str(p0))
  assert max(abs(pts[3] - p3)) < FLOAT_TOL, 'p3 comparison failed %s!=%s' % (str(
    pts[3]), str(p3))
  assert max(abs(pts[5] - p5)) < FLOAT_TOL, 'p5 comparison failed %s!=%s' % (str(
    pts[5]), str(p5))
def testPCA(self):
  """Check the eigenvalues/eigenvectors from PrincipalComponents against references."""
  eVals, eVects = Stats.PrincipalComponents(self.d)
  refVals = numpy.array([1.76877414, 0.92707592, 0.30414995])
  refVects = numpy.array([[-0.64200458, -0.66321742, 0.38467229],
                          [0.34166917, 0.20166619, 0.91792861],
                          [-0.68636164, 0.72074503, 0.09713033]])
  assert max((abs(eVals - refVals)).ravel()) < FLOAT_TOL, 'bad variances from PCA'
  # eigenvectors are only defined up to an overall sign, so accept either
  # the reference row or its negation
  for row in range(eVects.shape[0]):
    matchesPlus = max((abs(eVects[row] - refVects[row])).ravel()) < FLOAT_TOL
    matchesMinus = max((abs(eVects[row] + refVects[row])).ravel()) < FLOAT_TOL
    assert matchesPlus or matchesMinus, 'bad vectors from PCA'
def testTransform(self):
  """Transform the points into the PCA frame and compare selected rows to references."""
  _, eVects = Stats.PrincipalComponents(self.d)
  pts = Stats.TransformPoints(eVects, self.d)
  # full set of expected transformed coordinates; only rows 0, 3 and 5
  # are actually checked below
  refPs = [numpy.array([-1.20362098, -1.79265006, 0.08776266]),
           numpy.array([4.63540648, 1.1669869, 0.47026415]),
           numpy.array([0.8709456, -0.50012821, 0.24763993]),
           numpy.array([-3.94140499, -2.88350573, 0.64863041]),
           numpy.array([-0.97015382, 2.42239972, 0.51066736]),
           numpy.array([2.43084762, 3.3115892, -0.77094542]),
           numpy.array([0.74360559, -2.67765459, 0.73974091]),
           numpy.array([-1.2274861, 3.6819975, -0.07856395]),
           numpy.array([-0.4342764, 0.04320715, 0.28202332]),
           numpy.array([-0.903863, -2.77224188, -2.13721937])]
  for idx in (0, 3, 5):
    ref = refPs[idx]
    assert max(abs(pts[idx] - ref)) < FLOAT_TOL, \
        'p%d comparison failed %s!=%s' % (idx, str(pts[idx]), str(ref))
def testCovariance(self):
  """Covariance-matrix test.

  Reference values taken from the NIST handbook:
  http://www.itl.nist.gov/div898/handbook/pmc/section5/pmc541.htm
  """
  pts = numpy.array([[4., 2, 0.6],
                     [4.2, 2.1, 0.59],
                     [3.9, 2.0, 0.58],
                     [4.3, 2.1, 0.62],
                     [4.1, 2.2, 0.63]])
  cov = Stats.FormCovarianceMatrix(pts)
  expected = numpy.array([[0.025, 0.0075, 0.00175],
                          [0.0075, 0.007, 0.00135],
                          [0.00175, 0.00135, 0.00043]])
  assert max(abs(cov - expected).ravel()) < FLOAT_TOL, 'covariance matrix incorrect'
nModels, nTrueActs=nTrueActives) else: if nModels > 1: print( '#Index\tAvg_num_correct\tConf90Pct\tAvg_num_picked\tNum_picks\tlast_selection' ) else: print( '#Index\tAvg_num_correct\tAvg_num_picked\tNum_picks\tlast_selection' ) i = 0 while i < nPts and counts[i] != 0: if nModels > 1: mean, sd = Stats.MeanAndDev(pickVects[i]) confInterval = Stats.GetConfidenceInterval(sd, len(pickVects[i]), level=90) print('%d\t%f\t%f\t%f\t%d\t%s' % (i + 1, final[i][0] / counts[i], confInterval, final[i][1] / counts[i], counts[i], str(selPts[i]))) else: print('%d\t%f\t%f\t%d\t%s' % (i + 1, final[i][0] / counts[i], final[i][1] / counts[i], counts[i], str(selPts[i]))) i += 1 mean, sd = Stats.MeanAndDev(halfwayPts) print('Halfway point: %.2f(%.2f)' % (mean, sd))
def EmbedOne(mol, name, match, pcophore, count=1, silent=0, **kwargs):
  """ generates statistics for a molecule's embeddings

  Four energies are computed for each embedding:
     1) E1: the energy (with constraints) of the initial embedding
     2) E2: the energy (with constraints) of the optimized embedding
     3) E3: the energy (no constraints) the geometry for E2
     4) E4: the energy (no constraints) of the optimized free-molecule
        (starting from the E3 geometry)

  Returns a 9-tuple:
     1) the mean value of E1
     2) the sample standard deviation of E1
     3) the mean value of E2
     4) the sample standard deviation of E2
     5) the mean value of E3
     6) the sample standard deviation of E3
     7) the mean value of E4
     8) the sample standard deviation of E4
     9) The number of embeddings that failed

  """
  global _times  # module-level dict accumulating optimization wall times
  atomMatch = [list(x.GetAtomIds()) for x in match]
  bm, ms, nFailed = EmbedPharmacophore(mol, atomMatch, pcophore, count=count, silent=silent,
                                       **kwargs)
  e1s = []
  e2s = []
  e3s = []
  e4s = []
  d12s = []
  d23s = []
  d34s = []
  for m in ms:
    # pass 1: optimize with the pharmacophore constraints applied
    t1 = time.time()
    try:
      e1, e2 = OptimizeMol(m, bm, atomMatch)
    except ValueError:
      # failed embeddings are silently skipped; nFailed from
      # EmbedPharmacophore is the reported failure count
      pass
    else:
      t2 = time.time()
      _times['opt1'] = _times.get('opt1', 0) + t2 - t1
      e1s.append(e1)
      e2s.append(e2)
      d12s.append(e1 - e2)
    # pass 2: optimize without constraints
    t1 = time.time()
    try:
      e3, e4 = OptimizeMol(m, bm)
    except ValueError:
      pass
    else:
      t2 = time.time()
      _times['opt2'] = _times.get('opt2', 0) + t2 - t1
      e3s.append(e3)
      e4s.append(e4)
      # NOTE(review): if the constrained optimization above raised, e2 here
      # is stale (from a previous molecule) or undefined (NameError on the
      # very first one) -- looks like this branch should be skipped in that
      # case; confirm before changing
      d23s.append(e2 - e3)
      d34s.append(e3 - e4)
    # NOTE(review): incrementing the `count` parameter here has no visible
    # effect (it is not read again) -- presumably a leftover
    count += 1
  # each statistic falls back to -1.0 when its list is empty (MeanAndDev
  # raises on empty input)
  try:
    e1, e1d = Stats.MeanAndDev(e1s)
  except Exception:
    e1 = -1.0
    e1d = -1.0
  try:
    e2, e2d = Stats.MeanAndDev(e2s)
  except Exception:
    e2 = -1.0
    e2d = -1.0
  try:
    e3, e3d = Stats.MeanAndDev(e3s)
  except Exception:
    e3 = -1.0
    e3d = -1.0
  try:
    e4, e4d = Stats.MeanAndDev(e4s)
  except Exception:
    e4 = -1.0
    e4d = -1.0
  if not silent:
    print('%s(%d): %.2f(%.2f) -> %.2f(%.2f) : %.2f(%.2f) -> %.2f(%.2f)' %
          (name, nFailed, e1, e1d, e2, e2d, e3, e3d, e4, e4d))
  return e1, e1d, e2, e2d, e3, e3d, e4, e4d, nFailed
def ErrorStats(conn, where, enrich=1):
  """Pull composite-run results from the database and summarize them.

  Arguments:
    conn: database connection exposing GetData(fields=..., where=...)
    where: where-clause text used to select the runs
    enrich: target class for the enrichment calculation; pass a negative
      value to skip the enrichment statistics entirely

  Returns a dict with percent error averages/deviations, confidence values,
  averaged result matrices, best-run info and (when enrich >= 0) enrichment
  mean/deviation; holdout entries ('h*') are included only when holdout
  data is present.  Returns None if the query fails or matches no runs.
  """
  fields = ('overall_error,holdout_error,overall_result_matrix,' +
            'holdout_result_matrix,overall_correct_conf,overall_incorrect_conf,' +
            'holdout_correct_conf,holdout_incorrect_conf')
  try:
    data = conn.GetData(fields=fields, where=where)
  except Exception:
    import traceback
    traceback.print_exc()
    return None
  nPts = len(data)
  if not nPts:
    sys.stderr.write('no runs found\n')
    return None
  # numpy.float was removed in numpy >= 1.20; use the explicit 64-bit type
  overall = numpy.zeros(nPts, numpy.float64)
  overallEnrich = numpy.zeros(nPts, numpy.float64)
  oCorConf = 0.0
  oInCorConf = 0.0
  holdout = numpy.zeros(nPts, numpy.float64)
  holdoutEnrich = numpy.zeros(nPts, numpy.float64)
  hCorConf = 0.0
  hInCorConf = 0.0
  overallMatrix = None
  holdoutMatrix = None
  for i in range(nPts):
    if data[i][0] is not None:
      overall[i] = data[i][0]
      oCorConf += data[i][4]
      oInCorConf += data[i][5]
    if data[i][1] is not None:
      holdout[i] = data[i][1]
      haveHoldout = 1
    else:
      haveHoldout = 0
    # SECURITY NOTE: the result matrices are stored as python expressions
    # and rebuilt with eval(); only safe because the data comes from our own
    # database -- never point this at untrusted input
    tmpOverall = 1. * eval(data[i][2])
    if enrich >= 0:
      overallEnrich[i] = ScreenComposite.CalcEnrichment(tmpOverall, tgt=enrich)
    if haveHoldout:
      tmpHoldout = 1. * eval(data[i][3])
      if enrich >= 0:
        holdoutEnrich[i] = ScreenComposite.CalcEnrichment(tmpHoldout, tgt=enrich)
    # accumulate the matrices; the first row initializes them
    if overallMatrix is None:
      if data[i][2] is not None:
        overallMatrix = tmpOverall
      if haveHoldout and data[i][3] is not None:
        holdoutMatrix = tmpHoldout
    else:
      overallMatrix += tmpOverall
      if haveHoldout:
        holdoutMatrix += tmpHoldout
    if haveHoldout:
      hCorConf += data[i][6]
      hInCorConf += data[i][7]
  avgOverall = sum(overall) / nPts
  oCorConf /= nPts
  oInCorConf /= nPts
  overallMatrix /= nPts
  oSort = numpy.argsort(overall)
  oMin = overall[oSort[0]]
  overall -= avgOverall
  # sample standard deviation (nPts - 1 denominator)
  devOverall = numpy.sqrt(sum(overall**2) / (nPts - 1))
  res = {}
  res['oAvg'] = 100 * avgOverall
  res['oDev'] = 100 * devOverall
  res['oCorrectConf'] = 100 * oCorConf
  res['oIncorrectConf'] = 100 * oInCorConf
  res['oResultMat'] = overallMatrix
  res['oBestIdx'] = oSort[0]
  res['oBestErr'] = 100 * oMin
  if enrich >= 0:
    mean, dev = Stats.MeanAndDev(overallEnrich)
    res['oAvgEnrich'] = mean
    res['oDevEnrich'] = dev
  # NOTE(review): haveHoldout carries the value from the LAST loop
  # iteration, so holdout stats are emitted based on the final row only --
  # presumably all rows agree on this; confirm
  if haveHoldout:
    avgHoldout = sum(holdout) / nPts
    hCorConf /= nPts
    hInCorConf /= nPts
    holdoutMatrix /= nPts
    hSort = numpy.argsort(holdout)
    hMin = holdout[hSort[0]]
    holdout -= avgHoldout
    devHoldout = numpy.sqrt(sum(holdout**2) / (nPts - 1))
    res['hAvg'] = 100 * avgHoldout
    res['hDev'] = 100 * devHoldout
    res['hCorrectConf'] = 100 * hCorConf
    res['hIncorrectConf'] = 100 * hInCorConf
    res['hResultMat'] = holdoutMatrix
    res['hBestIdx'] = hSort[0]
    res['hBestErr'] = 100 * hMin
    if enrich >= 0:
      mean, dev = Stats.MeanAndDev(holdoutEnrich)
      res['hAvgEnrich'] = mean
      res['hDevEnrich'] = dev
  return res