Ejemplo n.º 1
0
 def test5(self):
     """ indicesToUse

     Filtering with an explicit ``indicesToUse`` that lists every point
     has to produce the same kept/rejected examples as filtering without it.
     """
     seed = (23, 42)
     nPts = len(self.d1)
     # each entry: (filter fraction, expected # kept, expected # rejected)
     cases = (
         (.5, 4, 2),
         (.7, 3, 3),
         (.75, 3, 3),
         (.333, 6, 0),
         (.25, 4, 2),
     )
     for frac, nKeep, nRej in cases:
         # run restricted to the (complete) list of indices
         DataUtils.InitRandomNumbers(seed)
         keep, rej = DataUtils.FilterData(self.d1, 1, frac,
                                          indicesToUse=range(nPts))
         assert len(keep) == nKeep, 'bad nKeep (%d != %d)' % (len(keep),
                                                              nKeep)
         assert len(rej) == nRej, 'bad nRej (%d != %d)' % (len(rej), nRej)

         # make sure the examples are actually correct: re-seed and compare
         # against the unrestricted call
         DataUtils.InitRandomNumbers(seed)
         tgtKeep, tgtRej = DataUtils.FilterData(self.d1, 1, frac)
         assert keep == tgtKeep, '%.2f: %s!=%s' % (frac, str(keep),
                                                   str(tgtKeep))
         assert rej == tgtRej, '%.2f: %s!=%s' % (frac, str(rej),
                                                 str(tgtRej))
Ejemplo n.º 2
0
 def test4_indicesOnly_indicesToUse(self):
     # """ indicesOnly with indicesToUse """
     # The results of the indicesOnly call are used as indices into
     # self.d1; the examples they select have to match what a plain
     # FilterData call returns for the same seed.
     seed = (23, 42)
     nPts = len(self.d1)
     # each entry: (filter fraction, expected # kept, expected # rejected)
     cases = (
         (.5, 4, 2),
         (.7, 3, 3),
         (.75, 3, 3),
         (.333, 6, 0),
         (.25, 4, 2),
     )
     for frac, nKeep, nRej in cases:
         DataUtils.InitRandomNumbers(seed)
         keepIdx, rejIdx = DataUtils.FilterData(self.d1, 1, frac,
                                                indicesToUse=range(nPts),
                                                indicesOnly=1)
         assert len(keepIdx) == nKeep, 'bad nKeep (%d != %d)' % (
             len(keepIdx), nKeep)
         assert len(rejIdx) == nRej, 'bad nRej (%d != %d)' % (len(rejIdx),
                                                              nRej)

         # make sure the indices are actually correct
         keep = [self.d1[idx] for idx in keepIdx]
         rej = [self.d1[idx] for idx in rejIdx]
         DataUtils.InitRandomNumbers(seed)
         tgtKeep, tgtRej = DataUtils.FilterData(self.d1, 1, frac)
         assert keep == tgtKeep, '%.2f: %s!=%s' % (frac, str(keep),
                                                   str(tgtKeep))
         assert rej == tgtRej, '%.2f: %s!=%s' % (frac, str(rej),
                                                 str(tgtRej))
Ejemplo n.º 3
0
  def testPerm1(self):
    """ tests the descriptor remapping stuff in a packager """
    from rdkit.Chem import Descriptors
    # The pickle is read as text so that '\r\n' line endings can be
    # normalized to '\n' before the bytes are handed to the unpickler.
    # The with-block closes the file; no explicit close() is needed.
    with open(os.path.join(self.dataDir,'Jan9_build3_pkg.pkl'),'r') as pkgTF:
      buf = pkgTF.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as pkgF:
      pkg = cPickle.load(pkgF)
    calc = pkg.GetCalculator()
    names = calc.GetDescriptorNames()
    ref = {}
    DataUtils.InitRandomNumbers((23,42))
    for smi,_,_ in self.testD:
      # reference values, computed in the calculator's own descriptor order;
      # names that Descriptors does not provide evaluate to the constant 777
      for desc in names:
        fn = getattr(Descriptors,desc,lambda x:777)
        m = Chem.MolFromSmiles(smi)
        ref[desc] = fn(m)

      # recompute in five shuffled orders: the values must not change
      for _ in range(5):
        perm = list(names)
        # the ``random`` argument of random.shuffle() was removed in
        # Python 3.11, so rely on the module-level generator directly
        random.shuffle(perm)

        m = Chem.MolFromSmiles(smi)
        for desc in perm:
          fn = getattr(Descriptors,desc,lambda x:777)
          val = fn(m)
          assert feq(val,ref[desc],1e-4),'%s: %s(%s): %f!=%f'%(str(perm),
                                                               smi,
                                                               desc,
                                                               val,
                                                               ref[desc])
Ejemplo n.º 4
0
    def testPerm1(self):
        """ tests the descriptor remapping stuff in a packager """
        from rdkit.Chem import Descriptors
        # use a context manager so the pickle file is closed even if
        # unpickling fails
        pklName = os.path.join(self.dataDir, 'Jan9_build3_pkg.pkl')
        with open(pklName, 'rb') as pkgF:
            pkg = cPickle.load(pkgF)
        calc = pkg.GetCalculator()
        names = calc.GetDescriptorNames()
        ref = {}
        DataUtils.InitRandomNumbers((23, 42))
        for smi, _, _ in self.testD:
            # reference values, computed in the calculator's own order;
            # names that Descriptors does not provide evaluate to 777
            for desc in names:
                fn = getattr(Descriptors, desc, lambda x: 777)
                m = Chem.MolFromSmiles(smi)
                ref[desc] = fn(m)

            # recompute in five shuffled orders: the values must not change
            for _ in range(5):
                perm = list(names)
                random.shuffle(perm)

                m = Chem.MolFromSmiles(smi)
                for desc in perm:
                    fn = getattr(Descriptors, desc, lambda x: 777)
                    val = fn(m)
                    assert feq(
                        val, ref[desc],
                        1e-4), '%s: %s(%s): %f!=%f' % (str(perm), smi, desc,
                                                       val, ref[desc])
Ejemplo n.º 5
0
    def setUp(self):
        # here is what we are going to do to test this out
        # - generate bit vectors of length nbits
        # - turn on a fraction of the first nbits/2 bits at random
        # - for each bit i turned on in the range (0, nbits/2) turn on the bit
        #   nbits/2 + i
        # - basically the first half of a fingerprint is same as the second half of the
        #   fingerprint
        # - if we repeat this process often enough we should see strong correlation between
        #   the bits i (i < nbits/2) and (nbits/2 + i)
        DataUtils.InitRandomNumbers((100, 23))
        self.nbits = 200
        self.d = 40
        self.nfp = 1000

        self.blist = list(range(self.nbits))

        # integer division: bit indices have to be ints, and range() does
        # not accept a float
        half = self.nbits // 2

        self.fps = []
        for _ in range(self.nfp):
            fp = DataStructs.ExplicitBitVect(self.nbits)
            # random.shuffle() needs a mutable sequence, so materialize
            # the range as a list first
            obits = list(range(half))
            random.shuffle(obits)
            for bit in obits[:self.d]:
                fp.SetBit(bit)
                fp.SetBit(bit + half)
            self.fps.append(fp)
Ejemplo n.º 6
0
    def testPerm1(self):
        # """ tests the descriptor remapping stuff in a packager """
        pkg = self._loadPackage()
        calc = pkg.GetCalculator()
        names = calc.GetDescriptorNames()
        ref = {}
        DataUtils.InitRandomNumbers((23, 42))
        for smi, _, _ in self.testD:
            # reference values, computed in the calculator's own order;
            # names that Descriptors does not provide evaluate to 777
            for desc in names:
                fn = getattr(Descriptors, desc, lambda x: 777)
                m = Chem.MolFromSmiles(smi)
                ref[desc] = fn(m)

            # recompute in five shuffled orders: the values must not change
            for _ in range(5):
                perm = list(names)
                # the ``random`` argument of random.shuffle() was removed
                # in Python 3.11
                random.shuffle(perm)

                m = Chem.MolFromSmiles(smi)
                for desc in perm:
                    fn = getattr(Descriptors, desc, lambda x: 777)
                    val = fn(m)
                    assert feq(
                        val, ref[desc],
                        1e-4), '%s: %s(%s): %f!=%f' % (str(perm), smi, desc,
                                                       val, ref[desc])
Ejemplo n.º 7
0
 def testPerm2(self):
     # """ tests the descriptor remapping stuff in a packager """
     pkg = self._loadPackage()
     calc = pkg.GetCalculator()
     names = calc.GetDescriptorNames()
     DataUtils.InitRandomNumbers((23, 42))
     # hand the calculator a shuffled copy of its own descriptor names and
     # re-initialize the package before verifying it
     perm = list(names)
     # the ``random`` argument of random.shuffle() was removed in
     # Python 3.11
     random.shuffle(perm)
     calc.simpleList = perm
     calc.descriptorNames = perm
     pkg.Init()
     self._verify(pkg, self.testD)
Ejemplo n.º 8
0
 def testPerm2(self):
     """ tests the descriptor remapping stuff in a packager """
     # use a context manager so the pickle file is closed even if
     # unpickling fails
     pklName = os.path.join(self.dataDir, 'Jan9_build3_pkg.pkl')
     with open(pklName, 'rb') as pkgF:
         pkg = cPickle.load(pkgF)
     calc = pkg.GetCalculator()
     names = calc.GetDescriptorNames()
     DataUtils.InitRandomNumbers((23, 42))
     # hand the calculator a shuffled copy of its own descriptor names and
     # re-initialize the package before verifying it
     perm = list(names)
     random.shuffle(perm)
     calc.simpleList = perm
     calc.descriptorNames = perm
     pkg.Init()
     self._verify(pkg, self.testD)
Ejemplo n.º 9
0
    def test_SplitData(self):
        # split fractions outside the accepted range are rejected
        for badFrac in (-1.1, 1.1):
            self.assertRaises(ValueError, SplitData.SplitDataSet, None,
                              badFrac)

        DataUtils.InitRandomNumbers((23, 42))
        points = list(range(10))
        captured = StringIO()
        with redirect_stdout(captured):
            parts = SplitData.SplitDataSet(points, 0.5)

        # the two parts share no points and the first holds half of them
        overlap = set(parts[0]).intersection(parts[1])
        self.assertEqual(overlap, set())
        self.assertEqual(len(parts[0]), 5)

        # the split reports on both parts through stdout
        output = captured.getvalue()
        for expected in ('Training', 'hold-out'):
            self.assertIn(expected, output)
Ejemplo n.º 10
0
 def setUp(self):
     """ loads the pickled example set and defines the per-variable metadata

     The random number generators are seeded at the end so that each test
     starts from the same state.
     """
     # use a context manager so the pickle file is closed even if
     # unpickling fails
     pklName = RDConfig.RDCodeDir + '/ML/Composite/test_data/ferro.pkl'
     with open(pklName, 'rb') as pklF:
         self.examples = cPickle.load(pklF)
     self.varNames = [
         'composition', 'max_atomic', 'has3d', 'has4d', 'has5d', 'elconc',
         'atvol', 'isferro'
     ]
     # one entry per variable name above
     self.qBounds = [[], [1.89, 3.53], [], [], [], [0.55, 0.73],
                     [11.81, 14.52], []]
     self.nPoss = [0, 3, 2, 2, 2, 3, 3, 2]
     # indices of every variable except the first and the last; kept as a
     # real list so it behaves the same on Python 2 and Python 3
     self.attrs = list(range(1, len(self.varNames) - 1))
     from rdkit.ML.Data import DataUtils
     DataUtils.InitRandomNumbers((23, 43))
Ejemplo n.º 11
0
 def testPerm2(self):
   """ tests the descriptor remapping stuff in a packager """
   # The pickle is read as text so that '\r\n' line endings can be
   # normalized to '\n' before the bytes are handed to the unpickler.
   # The with-block closes the file; no explicit close() is needed.
   with open(os.path.join(self.dataDir,'Jan9_build3_pkg.pkl'),'r') as pkgTF:
     buf = pkgTF.read().replace('\r\n', '\n').encode('utf-8')
   with io.BytesIO(buf) as pkgF:
     pkg = cPickle.load(pkgF)
   calc = pkg.GetCalculator()
   names = calc.GetDescriptorNames()
   DataUtils.InitRandomNumbers((23,42))
   # hand the calculator a shuffled copy of its own descriptor names and
   # re-initialize the package before verifying it
   perm = list(names)
   # the ``random`` argument of random.shuffle() was removed in Python 3.11
   random.shuffle(perm)
   calc.simpleList = perm
   calc.descriptorNames = perm
   pkg.Init()
   self._verify(pkg,self.testD)
Ejemplo n.º 12
0
 def setUp(self):
     """ loads the pickled example set and defines the per-variable metadata

     The random number generators are seeded at the end so that each test
     starts from the same state.
     """
     # The pickle is read as text so that '\r\n' line endings can be
     # normalized to '\n' before the bytes are handed to the unpickler.
     # The with-block closes the file; no explicit close() is needed.
     with open(RDConfig.RDCodeDir + '/ML/Composite/test_data/ferro.pkl',
               'r') as pklTF:
         buf = pklTF.read().replace('\r\n', '\n').encode('utf-8')
     with io.BytesIO(buf) as pklF:
         self.examples = cPickle.load(pklF)
     self.varNames = [
         'composition', 'max_atomic', 'has3d', 'has4d', 'has5d', 'elconc',
         'atvol', 'isferro'
     ]
     # one entry per variable name above
     self.qBounds = [[], [1.89, 3.53], [], [], [], [0.55, 0.73],
                     [11.81, 14.52], []]
     self.nPoss = [0, 3, 2, 2, 2, 3, 3, 2]
     # indices of every variable except the first and the last
     self.attrs = list(range(1, len(self.varNames) - 1))
     from rdkit.ML.Data import DataUtils
     DataUtils.InitRandomNumbers((23, 43))
Ejemplo n.º 13
0
             import traceback
             print('problems with model %s:' % modelName)
             traceback.print_exc()
         else:
             models.append(model)
 nModels = len(models)
 pickVects = {}
 halfwayPts = [1e8] * len(models)
 for whichModel, model in enumerate(models):
     tmpD = dataSet
     try:
         seed = model._randomSeed
     except AttributeError:
         pass
     else:
         DataUtils.InitRandomNumbers(seed)
     if details.shuffleActivities:
         DataUtils.RandomizeActivities(tmpD, shuffle=1)
     if hasattr(model, '_splitFrac') and (details.doHoldout
                                          or details.doTraining):
         trainIdx, testIdx = SplitData.SplitIndices(tmpD.GetNPts(),
                                                    model._splitFrac,
                                                    silent=1)
         if details.filterFrac != 0.0:
             trainFilt, temp = DataUtils.FilterData(tmpD,
                                                    details.filterVal,
                                                    details.filterFrac,
                                                    -1,
                                                    indicesToUse=trainIdx,
                                                    indicesOnly=1)
             testIdx += temp
Ejemplo n.º 14
0
def RunOnData(details, data, progressCallback=None, saveIt=1, setDescNames=0):
    """ builds a composite model from a data set, reports its error and
    optionally saves it

      **Arguments**

        - details: the run-details object; its attributes (lockRandom,
          splitRun, filterFrac, nModels, useTrees, useKNN, ...) select the
          random seed, the train/hold-out split, the filtering and the type
          of model that is built

        - data: the data set; it has to provide GetNamedData(), GetNVars(),
          GetNPossibleVals(), GetVarNames() and GetNPts()

        - progressCallback: (optional) forwarded to the composite's Grow()
          method by the tree and sig-tree builders

        - saveIt: (optional) if nonzero, the composite is pickled to
          _details.outName_

        - setDescNames: (optional) if nonzero, the composite's input order
          is taken from the data set's variable names and its descriptor
          names from _details._descNames_; otherwise the descriptor names
          are the data set's variable names

      **Returns**

        the composite model that was built

      **Notes**

        - _details.model_ is set to a pickled copy of the composite

        - error statistics are stored on the module-level _runDetails_
          object

    """
    if details.lockRandom:
        seed = details.randomSeed
    else:
        import random
        # integer bounds: randint() rejects float arguments such as 1e6 on
        # current Python versions
        seed = (random.randint(0, 1000000), random.randint(0, 1000000))
    DataUtils.InitRandomNumbers(seed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    namedExamples = data.GetNamedData()
    if details.splitRun == 1:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples),
                                                   details.splitFrac,
                                                   silent=not _verbose)

        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
    else:
        testExamples = []
        testIdx = []
        trainIdx = list(range(len(namedExamples)))
        trainExamples = namedExamples

    if details.filterFrac != 0.0:
        # if we're doing quantization on the fly, we need to handle that here:
        if hasattr(details, 'activityBounds') and details.activityBounds:
            tExamples = []
            bounds = details.activityBounds
            for pt in trainExamples:
                # work on a copy so the training examples keep their raw
                # activity values
                pt = pt[:]
                act = pt[-1]
                # the quantized activity is the index of the first bound
                # the raw value falls below, or len(bounds) if there is none
                bound = 0
                while bound < len(bounds) and not act < bounds[bound]:
                    bound += 1
                pt[-1] = bound
                tExamples.append(pt)
        else:
            bounds = None
            tExamples = trainExamples
        trainIdx, temp = DataUtils.FilterData(tExamples,
                                              details.filterVal,
                                              details.filterFrac,
                                              -1,
                                              indicesOnly=1)
        # examples rejected by the filter are moved to the test set
        tmp = [trainExamples[x] for x in trainIdx]
        testExamples += [trainExamples[x] for x in temp]
        trainExamples = tmp

        counts = DataUtils.CountResults(trainExamples, bounds=bounds)
        message('Result Counts in training set:')
        for k in sorted(counts.keys()):
            message(str((k, counts[k])))
        counts = DataUtils.CountResults(testExamples, bounds=bounds)
        message('Result Counts in test set:')
        for k in sorted(counts.keys()):
            message(str((k, counts[k])))
    nExamples = len(trainExamples)
    message('Training with %d examples' % (nExamples))

    nVars = data.GetNVars()
    # has to be a real list: entries are removed from it below
    attrs = list(range(1, nVars + 1))
    nPossibleVals = data.GetNPossibleVals()
    # drop every variable flagged with -1 possible values
    for i in range(1, len(nPossibleVals)):
        if nPossibleVals[i - 1] == -1:
            attrs.remove(i)

    if details.pickleDataFileName != '':
        with open(details.pickleDataFileName, 'wb+') as pickleDataFile:
            cPickle.dump(trainExamples, pickleDataFile)
            cPickle.dump(testExamples, pickleDataFile)

    if details.bayesModel:
        composite = BayesComposite.BayesComposite()
    else:
        composite = Composite.Composite()

    # record how the composite was built on the composite itself
    composite._randomSeed = seed
    composite._splitFrac = details.splitFrac
    composite._shuffleActivities = details.shuffleActivities
    composite._randomizeActivities = details.randomActivities

    if hasattr(details, 'filterFrac'):
        composite._filterFrac = details.filterFrac
    if hasattr(details, 'filterVal'):
        composite._filterVal = details.filterVal

    composite.SetModelFilterData(details.modelFilterFrac,
                                 details.modelFilterVal)

    composite.SetActivityQuantBounds(details.activityBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
        nPossibleVals[-1] = len(details.activityBounds) + 1

    if setDescNames:
        composite.SetInputOrder(data.GetVarNames())
        composite.SetDescriptorNames(details._descNames)
    else:
        composite.SetDescriptorNames(data.GetVarNames())
    composite.SetActivityQuantBounds(details.activityBounds)
    if details.nModels == 1:
        details.internalHoldoutFrac = 0.0
    if details.useTrees:
        from rdkit.ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from rdkit.ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from rdkit.ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        composite.SetQuantBounds(details.qBounds)
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=details.nModels,
                       pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy,
                       needsQuantization=0,
                       treeBuilder=builder,
                       nQuantBounds=details.qBounds,
                       startAt=details.startAt,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors,
                       silent=not _verbose)

    elif details.useSigTrees:
        from rdkit.ML.DecTree import CrossValidate
        from rdkit.ML.DecTree import BuildSigTree
        builder = BuildSigTree.SigTreeBuilder
        driver = CrossValidate.CrossValidationDriver
        nPossibleVals = data.GetNPossibleVals()
        if details.activityBounds:
            nPossibleVals[-1] = len(details.activityBounds) + 1
        # these options are not present on every details object
        biasList = getattr(details, 'sigTreeBiasList', None)
        useCMIM = getattr(details, 'useCMIM', 0)
        allowCollections = getattr(details, 'allowCollections', False)
        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       nTries=details.nModels,
                       needsQuantization=0,
                       treeBuilder=builder,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       holdOutFrac=details.internalHoldoutFrac,
                       replacementSelection=details.replacementSelection,
                       recycleVars=details.recycleVars,
                       randomDescriptors=details.randomDescriptors,
                       biasList=biasList,
                       useCMIM=useCMIM,
                       allowCollection=allowCollections,
                       silent=not _verbose)

    elif details.useKNN:
        from rdkit.ML.KNN import CrossValidate
        from rdkit.ML.KNN import DistFunctions

        driver = CrossValidate.CrossValidationDriver
        dfunc = ''
        if (details.knnDistFunc == "Euclidean"):
            dfunc = DistFunctions.EuclideanDist
        elif (details.knnDistFunc == "Tanimoto"):
            dfunc = DistFunctions.TanimotoDist
        else:
            assert 0, "Bad KNN distance metric value"

        composite.Grow(trainExamples,
                       attrs,
                       nPossibleVals=[0] + nPossibleVals,
                       buildDriver=driver,
                       nTries=details.nModels,
                       needsQuantization=0,
                       numNeigh=details.knnNeighs,
                       holdOutFrac=details.internalHoldoutFrac,
                       distFunc=dfunc)

    elif details.useNaiveBayes or details.useSigBayes:
        from rdkit.ML.NaiveBayes import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        if not getattr(details, 'useSigBayes', False):
            composite.Grow(trainExamples,
                           attrs,
                           nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver,
                           nTries=details.nModels,
                           needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           mEstimateVal=details.mEstimateVal,
                           silent=not _verbose)
        else:
            useCMIM = getattr(details, 'useCMIM', 0)

            composite.Grow(trainExamples,
                           attrs,
                           nPossibleVals=[0] + nPossibleVals,
                           buildDriver=driver,
                           nTries=details.nModels,
                           needsQuantization=0,
                           nQuantBounds=details.qBounds,
                           mEstimateVal=details.mEstimateVal,
                           useSigs=True,
                           useCMIM=useCMIM,
                           holdOutFrac=details.internalHoldoutFrac,
                           replacementSelection=details.replacementSelection,
                           silent=not _verbose)


##   elif details.useSVM:
##     from rdkit.ML.SVM import CrossValidate
##     driver = CrossValidate.CrossValidationDriver
##     composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
##                    buildDriver=driver, nTries=details.nModels,
##                    needsQuantization=0,
##                    cost=details.svmCost,gamma=details.svmGamma,
##                    weights=details.svmWeights,degree=details.svmDegree,
##                    type=details.svmType,kernelType=details.svmKernel,
##                    coef0=details.svmCoeff,eps=details.svmEps,nu=details.svmNu,
##                    cache_size=details.svmCache,shrinking=details.svmShrink,
##                    dataType=details.svmDataType,
##                    holdOutFrac=details.internalHoldoutFrac,
##                    replacementSelection=details.replacementSelection,
##                    silent=not _verbose)

    else:
        # no other model type selected: fall back to the neural-net driver
        from rdkit.ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples,
                       attrs, [0] + nPossibleVals,
                       nTries=details.nModels,
                       buildDriver=driver,
                       needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # do final statistics: count-weighted mean error and the count-weighted
    # mean absolute deviation from it
    weightedErrs = counts * avgErrs
    averageErr = sum(weightedErrs) / sum(counts)
    devs = (avgErrs - averageErr)
    devs = devs * counts
    devs = numpy.sqrt(devs * devs)
    avgDev = sum(devs) / sum(counts)
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
            (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    # blow out the saved examples and then save the composite:
    composite.ClearModelExamples()
    if saveIt:
        composite.Pickle(details.outName)
    details.model = DbModule.binaryHolder(cPickle.dumps(composite))

    badExamples = []
    if not details.detailedRes and not getattr(details, 'noScreen', False):
        if details.splitRun:
            message('Testing all hold-out examples')
            wrong = testall(composite, testExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong),
                     100. * float(len(wrong)) / float(len(testExamples))))
            _runDetails.holdout_error = float(len(wrong)) / len(testExamples)
        else:
            message('Testing all examples')
            wrong = testall(composite, namedExamples, badExamples)
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong),
                     100. * float(len(wrong)) / float(len(namedExamples))))
            _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

    if details.detailedRes:
        message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data,
                                                 composite, nPossibleVals[-1],
                                                 details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # NOTE(review): if nClass counts a subset of the nPts points this
        # difference is never positive; it looks like nPts - nClass was
        # intended -- confirm before changing.
        nRej = nClass - nPts
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

        if details.splitRun:
            message('\nHold-out data:')
            resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)),
                                                     testExamples, composite,
                                                     nPossibleVals[-1],
                                                     details.threshold)
            nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
            nPts = len(testExamples)
            nClass = nGood + nBad
            _runDetails.holdout_error = float(nBad) / nClass
            _runDetails.holdout_correct_conf = avgGood
            _runDetails.holdout_incorrect_conf = avgBad
            _runDetails.holdout_result_matrix = repr(voteTab)
            # NOTE(review): same sign question as for the entire data set
            nRej = nClass - nPts
            if nRej > 0:
                _runDetails.holdout_fraction_dropped = float(nRej) / nPts

    if details.persistTblName and details.dbName:
        message('Updating results table %s:%s' %
                (details.dbName, details.persistTblName))
        details.Store(db=details.dbName, table=details.persistTblName)

    if details.badName != '':
        # one tab-separated line per misclassified example: example, vote
        with open(details.badName, 'w+') as badFile:
            for i, ex in enumerate(badExamples):
                vote = wrong[i]
                outStr = '%s\t%s\n' % (ex, vote)
                badFile.write(outStr)

    composite.ClearModelExamples()
    return composite
Ejemplo n.º 15
0
def BalanceComposite(details,composite,data1=None,data2=None):
  """ balances the composite using the parameters provided in details

   **Arguments**

     - details a _CompositeRun.RunDetails_ object

     - composite: the composite model to be balanced

     - data1: (optional) if provided, this should be the
       data set used to construct the original models

     - data2: (optional) if provided, this should be the
       data set used to construct the new individual models

   **Returns**

     - the unmodified composite if no balancing is requested
       (_details.balCnt_ is zero or larger than the composite) or if either
       data set is unavailable

     - otherwise a list with one result of
       _AdjustComposite.BalanceComposite()_ per weight in
       _details.balWeight_

   **Notes**

     - _details.splitFrac_ and _details.randomSeed_ are overwritten with
       the values stored on the composite

  """
  if not details.balCnt or details.balCnt > len(composite):
    return composite
  message("Balancing Composite")

  #
  # start by getting data set 1: which is the data set used to build the
  #  original models
  #
  if data1 is None:
    message("\tReading First Data Set")
    # temporarily point details at the balance table/db, then restore it
    fName = details.balTable.strip()
    tmp = details.tableName
    details.tableName = fName
    dbName = details.dbName
    details.dbName = details.balDb
    data1 = details.GetDataSet()
    details.tableName = tmp
    details.dbName = dbName
  if data1 is None:
    return composite
  details.splitFrac = composite._splitFrac
  details.randomSeed = composite._randomSeed
  DataUtils.InitRandomNumbers(details.randomSeed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data1,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data1,shuffle=0,runDetails=details)
  namedExamples = data1.GetNamedData()
  if details.balDoHoldout or details.balDoTrain:
    trainIdx,testIdx = SplitData.SplitIndices(len(namedExamples),details.splitFrac,
                                              silent=1)
    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
    if details.filterFrac != 0.0:
      trainIdx,temp = DataUtils.FilterData(trainExamples,details.filterVal,
                                           details.filterFrac,-1,
                                           indicesOnly=1)
      # examples rejected by the filter are moved to the test set
      tmp = [trainExamples[x] for x in trainIdx]
      testExamples += [trainExamples[x] for x in temp]
      trainExamples = tmp
    if details.balDoHoldout:
      # balance against the hold-out examples instead of the training ones
      testExamples,trainExamples = trainExamples,testExamples
  else:
    trainExamples = namedExamples
  dataSet1 = trainExamples
  cols1 = [x.upper() for x in data1.GetVarNames()]
  data1 = None

  #
  # now grab data set 2: the data used to build the new individual models
  #
  if data2 is None:
    message("\tReading Second Data Set")
    data2 = details.GetDataSet()
  if data2 is None:
    return composite
  details.splitFrac = composite._splitFrac
  details.randomSeed = composite._randomSeed
  DataUtils.InitRandomNumbers(details.randomSeed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data2,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data2,shuffle=0,runDetails=details)
  dataSet2 = data2.GetNamedData()
  cols2 = [x.upper() for x in data2.GetVarNames()]
  data2 = None

  # and balance it:
  res = []
  weights = details.balWeight
  # a single weight is treated as a one-element sequence; isinstance() is
  # used because types.TupleType/ListType do not exist in Python 3
  if not isinstance(weights,(tuple,list)):
    weights = (weights,)
  for weight in weights:
    message("\tBalancing with Weight: %.4f"%(weight))
    res.append(AdjustComposite.BalanceComposite(composite,dataSet1,dataSet2,
                                                weight,
                                                details.balCnt,
                                                names1=cols1,names2=cols2))
  return res
Ejemplo n.º 16
0
def GrowIt(details,composite,progressCallback=None,
           saveIt=1,setDescNames=0,data=None):
  """ does the actual work of growing (adding models to) a composite model

    **Arguments**

      - details:  a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - composite: the composite model to grow

      - progressCallback: (optional) a function which is called with a single
        argument (the number of models built so far) after each model is built.
        It is only forwarded to _composite.Grow()_ when _details.useTrees_ is
        set.

      - saveIt: (optional) accepted for interface compatibility.  This function
        never reads it and does not pickle the composite itself; saving is left
        to the caller.

      - setDescNames: (optional) if nonzero and _details.useTrees_ is set, the
        composite's _SetInputOrder()_ method is called with the results of the
        data set's _GetVarNames()_ method.

      - data: (optional) the data set to be used.  If this is not provided, the
        data set described in details will be used (and _details.outName_ is
        filled in from _details.tableName_ if it is empty).

    **Returns**

      the enlarged composite model

    **Notes**

      - the random number generator is re-seeded from _composite._randomSeed_

      - summary statistics (overall_error, etc.) are stored on the module-level
        _runDetails_ object, not on the _details_ argument.

  """
  details.rundate = time.asctime()

  if data is None:
    # pick the data source: flat file, quantized db table or plain db table
    fName = details.tableName.strip()
    if details.outName == '':
      details.outName = fName + '.pkl'
    if details.dbName == '':
      data = DataUtils.BuildQuantDataSet(fName)
    elif details.qBounds != []:
      details.tableName = fName
      data = details.GetDataSet()
    else:
      data = DataUtils.DBToQuantData(details.dbName,fName,quantName=details.qTableName,
                                     user=details.dbUser,password=details.dbPassword)

  # seed before any randomization so the activity shuffling is reproducible
  seed = composite._randomSeed
  DataUtils.InitRandomNumbers(seed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data,shuffle=0,runDetails=details)

  namedExamples = data.GetNamedData()
  trainExamples = namedExamples
  nExamples = len(trainExamples)
  message('Training with %d examples'%(nExamples))
  message('\t%d descriptors'%(len(trainExamples[0])-2))
  nVars = data.GetNVars()
  nPossibleVals = composite.nPossibleVals
  # materialize the attribute indices so that a real list is handed to Grow()
  attrs = list(range(1,nVars+1))

  if details.useTrees:
    from rdkit.ML.DecTree import CrossValidate,PruneTree
    if details.qBounds != []:
      from rdkit.ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from rdkit.ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    if setDescNames:
      composite.SetInputOrder(data.GetVarNames())
    composite.Grow(trainExamples,attrs,[0]+nPossibleVals,
                   buildDriver=driver,
                   pruner=pruner,
                   nTries=details.nModels,pruneIt=details.pruneIt,
                   lessGreedy=details.lessGreedy,needsQuantization=0,
                   treeBuilder=builder,nQuantBounds=details.qBounds,
                   startAt=details.startAt,
                   maxDepth=details.limitDepth,
                   progressCallback=progressCallback,
                   silent=not _verbose)
  else:
    from rdkit.ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    composite.Grow(trainExamples,attrs,[0]+nPossibleVals,nTries=details.nModels,
                   buildDriver=driver,needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList,counts,avgErrs = composite.GetAllData()
  counts = numpy.array(counts)
  avgErrs = numpy.array(avgErrs)
  composite._varNames = data.GetVarNames()

  for model in modelList:
    model.NameModel(composite._varNames)

  # do final statistics: count-weighted mean error and mean absolute deviation
  weightedErrs = counts*avgErrs
  averageErr = sum(weightedErrs)/sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  devs = numpy.sqrt(devs*devs)
  avgDev = sum(devs)/sum(counts)
  if _verbose:
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f'%(100.*averageErr,100.*avgDev))

  if details.bayesModel:
    composite.Train(trainExamples,verbose=0)

  if not details.detailedRes:
    badExamples = []
    if _verbose:
      message('Testing all examples')
    wrong = BuildComposite.testall(composite,namedExamples,badExamples)
    if _verbose:
      message('%d examples (%% %5.2f) were misclassified'%(len(wrong),100.*float(len(wrong))/float(len(namedExamples))))
    _runDetails.overall_error = float(len(wrong))/len(namedExamples)
  else:
    if _verbose:
      message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()),data,composite,
                                             nPossibleVals[-1],details.threshold)
    nGood,nBad,nSkip,avgGood,avgBad,avgSkip,voteTab = resTup
    nPts = len(namedExamples)
    # only the good + bad examples enter the error rate
    nClass = nGood+nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    # whatever is neither good nor bad was dropped.  The operands used to be
    # reversed (nClass-nPts), which is never positive, so the dropped
    # fraction was never recorded.
    nRej = nPts-nClass
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej)/nPts

  return composite
Ejemplo n.º 17
0
 def setUp(self):
     DataUtils.InitRandomNumbers((25, 25))