Exemple #1
0
    def test2XValClass(self):
        # Runs CrossValidationDriver on the KNN random-points data with 11
        # neighbours, then checks single-example classification and the
        # naming accessors of the returned model.
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data',
                               'random_pts.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()
        nPossible = dataSet.GetNPossibleVals()
        # attribute indices run 1..nvars; index 0 of each example is skipped
        attrIds = list(range(1, dataSet.GetNVars() + 1))
        nNeighbors = 11
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, nNeighbors, silent=1)
        self.assertAlmostEqual(xvalErr, 0.01075, 4)

        # The list handed in as neighborList is filled by ClassifyExample;
        # its first entry is expected to hold the query example itself.
        nearest = []
        predicted = model.ClassifyExample(examples[0], neighborList=nearest)
        self.assertEqual(predicted, 1)
        self.assertEqual(nearest[0][1], examples[0])

        # The name starts empty, can be set explicitly, and NameModel()
        # resets it to the model type string regardless of its argument.
        self.assertEqual(model.GetName(), '')
        model.SetName('name')
        self.assertEqual(model.GetName(), 'name')
        self.assertEqual(model.type(), 'Classification Model')
        model.NameModel('this argument is ignored')
        self.assertEqual(model.GetName(), 'Classification Model')
Exemple #2
0
    def test2NaiveBayes(self):
        # Runs CrossValidationDriver on the NaiveBayes stddata set with an
        # m-estimate value of 20, then checks the model type, its naming
        # accessors and that the training/hold-out split covers the data.
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes',
                               'test_data', 'stddata.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()

        nDescs = dataSet.GetNVars()
        attrIds = list(range(1, nDescs + 1))
        # layout of both lists: one leading entry, one entry per
        # descriptor, one trailing entry
        nPossible = [0] + [3] * nDescs + [2]
        quantBounds = [0] + [2] * nDescs + [0]
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds,
            mEstimateVal=20.0, silent=True)
        self.assertTrue(isinstance(model, NaiveBayesClassifier))
        self.assertAlmostEqual(xvalErr, 0.1818, 4)

        # The name starts empty, can be set, and NameModel() replaces it
        # with the class name.
        self.assertEqual(model.GetName(), '')
        model.SetName('modelName')
        self.assertEqual(model.GetName(), 'modelName')
        model.NameModel(None)
        self.assertEqual(model.GetName(), 'NaiveBayesClassifier')

        # Both example sets are non-empty and together reproduce the input.
        self.assertGreater(len(model.GetExamples()), 0)
        self.assertGreater(len(model.GetTrainingExamples()), 0)
        recombined = model.GetTrainingExamples() + model.GetExamples()
        self.assertEqual(sorted(recombined), sorted(examples))
Exemple #3
0
    def test1NaiveBayes(self):
        # Runs CrossValidationDriver on the NaiveBayes stddata set and pins
        # the resulting class probabilities, quantization bounds and
        # cross-validation error.
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes',
                               'test_data', 'stddata.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()

        nDescs = dataSet.GetNVars()
        attrIds = range(1, nDescs + 1)
        # layout of both lists: one leading entry, one entry per
        # descriptor, one trailing entry
        nPossible = [0] + [3] * nDescs + [2]
        quantBounds = [0] + [2] * nDescs + [0]
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds, silent=True)

        for clsIdx, prob in enumerate((0.5000, 0.5000)):
            self.assertAlmostEqual(model._classProbs[clsIdx], prob, 4)

        # (descriptor index, bound index, expected value, places);
        # places=None keeps assertAlmostEqual's default precision
        expectedBounds = (
            (1, 0, -0.0360, 4),
            (1, 1, 0.114, None),
            (2, 0, -0.7022, 4),
            (2, 1, -0.16635, 4),
            (3, 0, -0.3659, 4),
            (3, 1, 0.4305, 4),
        )
        for descIdx, boundIdx, bound, places in expectedBounds:
            self.assertAlmostEqual(model._QBoundVals[descIdx][boundIdx],
                                   bound, places)

        self.assertAlmostEqual(xvalErr, 0.2121, 4)
Exemple #4
0
    def test1NaiveBayes(self):
        # Runs CrossValidationDriver on the NaiveBayes stddata set three
        # times (default options, calcTotalError=True,
        # replacementSelection=True) and pins the class probabilities,
        # quantization bounds and error of each run.
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes',
                               'test_data', 'stddata.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()

        nDescs = dataSet.GetNVars()
        attrIds = list(range(1, nDescs + 1))
        # layout of both lists: one leading entry, one entry per
        # descriptor, one trailing entry
        nPossible = [0] + [3] * nDescs + [2]
        quantBounds = [0] + [2] * nDescs + [0]

        def checkRun(model, xvalErr, classProbs, boundVals, targetErr):
            # classProbs: expected model._classProbs[0..1], 4 places each.
            # boundVals: (descriptor index, bound index, expected, places)
            # entries; places=None keeps assertAlmostEqual's default.
            for clsIdx, prob in enumerate(classProbs):
                self.assertAlmostEqual(model._classProbs[clsIdx], prob, 4)
            for descIdx, boundIdx, bound, places in boundVals:
                self.assertAlmostEqual(model._QBoundVals[descIdx][boundIdx],
                                       bound, places)
            self.assertAlmostEqual(xvalErr, targetErr, 4)

        # default options
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds, silent=True)
        checkRun(model, xvalErr,
                 (0.5000, 0.5000),
                 ((1, 0, -0.0360, 4),
                  (1, 1, 0.114, None),
                  (2, 0, -0.7022, 4),
                  (2, 1, -0.16635, 4),
                  (3, 0, -0.3659, 4),
                  (3, 1, 0.4305, 4)),
                 0.2121)

        # calcTotalError=True
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds, silent=True,
            calcTotalError=True)
        checkRun(model, xvalErr,
                 (0.515151, 0.484848),
                 ((1, 0, -0.40315, 4),
                  (1, 1, 0.114, None),
                  (2, 0, -0.62185, 4),
                  (2, 1, -0.19965, 4),
                  (3, 0, 0.4305, 4),
                  (3, 1, 0.80305, 4)),
                 0.14563)

        # replacementSelection=True
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds, silent=True,
            replacementSelection=True)
        checkRun(model, xvalErr,
                 (0.5131578, 0.4868421),
                 ((1, 0, -0.036, 4),
                  (1, 1, 0.93465, 4),
                  (2, 0, -0.6696, 4),
                  (2, 1, -0.19965, 4),
                  (3, 0, -1.06785, 4),
                  (3, 1, 0.4305, 4)),
                 0.3)
Exemple #5
0
def RunIt(details, progressCallback=None, saveIt=1, setDescNames=0):
    """ loads the data set described by _details_ and builds a composite
    model from it by handing everything on to _RunOnData()_

    **Arguments**

      - details:  a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run.  It is modified in place:
        _rundate_ is set to the current time, _outName_ defaults to the
        stripped table name plus '.pkl' when it is empty, and _tableName_
        is replaced by its stripped form on the _GetDataSet()_ paths.

      - progressCallback: (optional) a function which is called with a single
        argument (the number of models built so far) after each model is
        built.  Forwarded to _RunOnData()_.

      - saveIt: (optional) if this is nonzero, the resulting model will be
        pickled and dumped to the filename specified in _details.outName_.
        Forwarded to _RunOnData()_.

      - setDescNames: (optional) controls how descriptor names are attached
        to the composite.  Forwarded to _RunOnData()_.

    **Returns**

      the composite model constructed

    **Notes**

      - without a _dbName_ the table name is treated as a text file: it is
        read with _TextFileToData()_ when _qBounds_ is non-empty and with
        _BuildQuantDataSet()_ otherwise.

      - with a _dbName_ the data comes from _details.GetDataSet()_ unless
        trees are used without quantization bounds, in which case
        _DBToQuantData()_ is called.

  """
    details.rundate = time.asctime()

    tableName = details.tableName.strip()
    if details.outName == '':
        details.outName = tableName + '.pkl'

    if details.dbName:
        if details.useSigTrees or details.useSigBayes:
            # signature-based models read pickled bit vectors
            details.tableName = tableName
            data = details.GetDataSet(
                pickleCol=0, pickleClass=DataStructs.ExplicitBitVect)
        elif details.qBounds != [] or not details.useTrees:
            details.tableName = tableName
            data = details.GetDataSet()
        else:
            # NOTE: the original author flagged DBToQuantData as a function
            # that is no longer defined, so this branch presumably fails
            # if it is ever reached.
            data = DataUtils.DBToQuantData(details.dbName,
                                           tableName,
                                           quantName=details.qTableName,
                                           user=details.dbUser,
                                           password=details.dbPassword)
    elif details.qBounds != []:
        data = DataUtils.TextFileToData(tableName)
    else:
        data = DataUtils.BuildQuantDataSet(tableName)

    return RunOnData(details,
                     data,
                     progressCallback=progressCallback,
                     saveIt=saveIt,
                     setDescNames=setDescNames)
Exemple #6
0
 def test2XValClass(self):
   # Runs CrossValidationDriver on the KNN random-points data with 11
   # neighbours and pins the resulting cross-validation error.
   csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data',
                          'random_pts.csv')
   dataSet = DataUtils.TextFileToData(csvPath)
   examples = dataSet.GetNamedData()
   nPossible = dataSet.GetNPossibleVals()
   # attribute indices run 1..nvars; index 0 of each example is skipped
   attrIds = range(1, dataSet.GetNVars() + 1)
   nNeighbors = 11
   model, xvalErr = CrossValidate.CrossValidationDriver(
     examples, attrIds, nPossible, nNeighbors, silent=1)
   self.assertAlmostEqual(xvalErr, 0.01075, 4)
Exemple #7
0
 def test4XValRegress(self):
   # Same data and neighbour count as the classification cross-validation
   # test, but the driver is told to build models with
   # CrossValidate.makeRegressionModel.
   csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data',
                          'random_pts.csv')
   dataSet = DataUtils.TextFileToData(csvPath)
   examples = dataSet.GetNamedData()
   nPossible = dataSet.GetNPossibleVals()
   # attribute indices run 1..nvars; index 0 of each example is skipped
   attrIds = range(1, dataSet.GetNVars() + 1)
   nNeighbors = 11
   model, xvalErr = CrossValidate.CrossValidationDriver(
     examples, attrIds, nPossible, nNeighbors, silent=1,
     modelBuilder=CrossValidate.makeRegressionModel)
   # NOTE: this number hasn't been extensively checked
   self.assertAlmostEqual(xvalErr, 0.0777, 4)
Exemple #8
0
    def test2NaiveBayes(self):
        # Runs CrossValidationDriver on the NaiveBayes stddata set with an
        # m-estimate value of 20 and pins the cross-validation error.
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes',
                               'test_data', 'stddata.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()

        nDescs = dataSet.GetNVars()
        attrIds = range(1, nDescs + 1)
        # layout of both lists: one leading entry, one entry per
        # descriptor, one trailing entry
        nPossible = [0] + [3] * nDescs + [2]
        quantBounds = [0] + [2] * nDescs + [0]
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds, mEstimateVal=20.0)

        assert feq(xvalErr, 0.19354)
Exemple #9
0
 def test1Neighbors(self):
   # Checks KNNModel.GetNeighbors against a brute-force ranking: the first
   # example is held out as the query point, every remaining example is
   # ranked by its Euclidean distance to it, and the closest numNeigh
   # entries must match what the model returns.
   fName = os.path.join(RDConfig.RDCodeDir,'ML','KNN','test_data','random_pts.csv')
   data = DataUtils.TextFileToData(fName)
   examples = data.GetNamedData()
   nvars = data.GetNVars()
   # attribute indices run 1..nvars; index 0 of each example is skipped
   attrs = range(1,nvars+1)
   numNeigh = 11
   metric = DistFunctions.EuclideanDist
   mdl = KNNModel.KNNModel(numNeigh,attrs,metric)
   # hold out the first example; the rest become the training set
   pt = examples.pop(0)
   # reference answer: (distance, example) pairs, closest first
   tgt = sorted((metric(pt,ex,attrs),ex) for ex in examples)
   mdl.SetTrainingExamples(examples)
   neighbors = mdl.GetNeighbors(pt)
   for i in range(numNeigh):
     dist, ex = tgt[i]
     # the model's neighbor entries are expected to carry the negated
     # distance, hence the sign flip on the reference value
     assert feq(-dist,neighbors[i][0])
     # compare on the first field of each example only
     assert ex[0]==neighbors[i][1][0]
Exemple #10
0
    def test3Regress(self):
        # Regression on a hand-built data set (sample_pts.csv) for which
        # the expected predictions are known, followed by checks of the
        # model's bookkeeping accessors.
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'KNN', 'test_data',
                               'sample_pts.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()
        attrIds = list(range(1, dataSet.GetNVars() + 1))
        nNeighbors = 4
        distFunc = DistFunctions.EuclideanDist
        model = KNNRegressionModel.KNNRegressionModel(nNeighbors, attrIds,
                                                      distFunc)
        model.SetTrainingExamples(examples)

        # plain (unweighted) predictions: query example -> expected value
        for query, target in (([4, -3.5, 2.5, 0], 1.25),
                              ([4, 3, 2, 0], 1.5),
                              ([4, 3, -2.5, 0], -1.5)):
            assert feq(model.PredictExample(query), target)

        # Use a distance dependent weight for the neighbours
        weighted = model.PredictExample([4, 3, -2.5, 0], weightedAverage=True)
        self.assertAlmostEqual(weighted, -1.6)

        # Check the case that the example is identical to one of the
        # neighbours (distance = 0)
        nearest = []
        weighted = model.PredictExample(examples[0],
                                        weightedAverage=True,
                                        neighborList=nearest)
        self.assertAlmostEqual(weighted, 1.5857864)
        self.assertEqual(nearest[0][1], examples[0])

        self.assertEqual(model.GetBadExamples(), [])

        # The name starts empty, can be set explicitly, and NameModel()
        # resets it to the model type string regardless of its argument.
        self.assertEqual(model.GetName(), '')
        model.SetName('name')
        self.assertEqual(model.GetName(), 'name')
        self.assertEqual(model.type(), 'Regression Model')
        model.NameModel('this argument is ignored')
        self.assertEqual(model.GetName(), 'Regression Model')

        # training and test examples together reproduce the input data
        recombined = model.GetTrainingExamples() + model.GetTestExamples()
        self.assertEqual(sorted(recombined), sorted(examples))
Exemple #11
0
  def test3Regress(self):
    """ predictions on a carefully laid out regression data set where the
    results are clear

    """
    fName = os.path.join(RDConfig.RDCodeDir,'ML','KNN','test_data','sample_pts.csv')
    data = DataUtils.TextFileToData(fName)
    examples = data.GetNamedData()
    nvars = data.GetNVars()
    # attribute indices run 1..nvars; index 0 of each example is skipped
    attrs = range(1,nvars+1)
    numNeigh = 4
    metric = DistFunctions.EuclideanDist
    mdl = KNNRegressionModel.KNNRegressionModel(numNeigh,attrs,metric)
    mdl.SetTrainingExamples(examples)

    # query example -> expected prediction
    cases = (
      ([4,-3.5,2.5,0], 1.25),
      ([4,3,2,0], 1.5),
      ([4,3,-2.5,0], -1.5),
    )
    for query, expected in cases:
      res = mdl.PredictExample(query)
      assert feq(res,expected)
Exemple #12
0
    def test1NaiveBayes(self):
        # Runs CrossValidationDriver on the NaiveBayes stddata set and pins
        # the resulting class probabilities, quantization bounds and
        # cross-validation error (all compared with feq).
        csvPath = os.path.join(RDConfig.RDCodeDir, 'ML', 'NaiveBayes',
                               'test_data', 'stddata.csv')
        dataSet = DataUtils.TextFileToData(csvPath)
        examples = dataSet.GetNamedData()

        nDescs = dataSet.GetNVars()
        attrIds = range(1, nDescs + 1)
        # layout of both lists: one leading entry, one entry per
        # descriptor, one trailing entry
        nPossible = [0] + [3] * nDescs + [2]
        quantBounds = [0] + [2] * nDescs + [0]
        model, xvalErr = CrossValidate.CrossValidationDriver(
            examples, attrIds, nPossible, quantBounds)

        for clsIdx, prob in enumerate((0.54167, 0.45833)):
            assert feq(model._classProbs[clsIdx], prob)

        # (descriptor index, bound index, expected value)
        expectedBounds = (
            (1, 0, -0.56995),
            (1, 1, 0.114),
            (2, 0, -0.7022),
            (2, 1, -0.2347),
            (3, 0, -0.3659),
            (3, 1, 1.17275),
        )
        for descIdx, boundIdx, bound in expectedBounds:
            assert feq(model._QBoundVals[descIdx][boundIdx], bound)

        assert feq(xvalErr, 0.16129)
Exemple #13
0
def FingerprintsFromDetails(details, reportFreq=10):
    """ builds fingerprints for the molecules described by _details_ and
    optionally stores them in a pickle file and/or a database table

    **Arguments**

      - details: an object carrying the run options.  The input source is
        picked in this order:

          1. _dbName_ and _tableName_ set: ID and SMILES columns are read
             from the database table;

          2. _inFileName_ and _useSmiles_ set: ID and SMILES columns are
             read from a text file;

          3. _inFileName_ and _useSD_ set: molecules are read from an SD
             file.

        If none applies, nothing is read.  When _idName_ is empty it is
        filled in (first column of the table, or 'ID' for files).

      - reportFreq: (optional) while reading an SD file a progress message
        is emitted every _reportFreq_ molecules; values <= 0 disable the
        messages.

    **Returns**

      whatever the fingerprinting helper returned (iterated below as
      (ID, fingerprint) pairs), or None if no input data was available

    **Notes**

      - if _details.outFileName_ is set, each entry of the result is
        pickled to that file.

      - if _details.outTableName_ and a database name (_outDbName_ or
        _dbName_) are set, the fingerprints are inserted into that table,
        which is created when missing or when _replaceTable_ is set.

      - when reading SD files, _details.maxMols_ > 0 limits the number of
        molecules read.

    """
    data = None
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        # The connection itself is not used for reading; a failure here is
        # only reported and the reads below are still attempted.
        try:
            conn = DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' %
                  (details.dbName, details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName,
                                                   details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName,
                                     details.tableName,
                                     what='%s,%s' %
                                     (details.idName, details.smilesName))
        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        conn = None
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(
                details.inFileName,
                onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
            # Without this the name would be unbound and the checks below
            # would die with a NameError instead of returning None.
            dataSet = None

        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSD:
        conn = None
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            for mol in supplier:
                # falsy entries (molecules that could not be read) are skipped
                if not mol:
                    continue
                dataSet.append(mol)
                if reportFreq > 0 and not len(dataSet) % reportFreq:
                    message('Read %d molecules\n' % (len(dataSet)))
                # The limit is checked for every molecule so that it is
                # honoured independently of the reporting frequency.
                if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                    break

        # pair every molecule with its ID, falling back to the '_Name'
        # property when the requested ID property is missing
        for i, mol in enumerate(dataSet):
            if mol.HasProp(details.idName):
                nm = mol.GetProp(details.idName)
            else:
                nm = mol.GetProp('_Name')
            dataSet[i] = (nm, mol)
    else:
        dataSet = None

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol,
                                         **details.__dict__)
        else:
            fps = FingerprintsFromPickles(data, idCol, smiCol,
                                          **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            # the context manager closes the file even if pickling fails
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    pickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            # NOTE(review): data is still None when the input was an SD
            #  file, so len(data) fails on that path - confirm whether DB
            #  output is meant to be supported for SD input.
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings(
                [details.idName, details.smilesName],
                colTypes,
                keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName,
                                  DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for ID, fp in fps:
                tpl = ID, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps