Example #1
0
def RandomizeMolBlock(molB):
    """ Returns a copy of a V2000 mol block with the atoms in a random order

    The three header lines and the counts line are copied unchanged, the atom
    lines are written in a shuffled order and the two atom indices at the
    start of every bond line are renumbered to match.  Anything that follows
    the bond block in the input is dropped; a single 'M  END' line is
    appended.

    **Arguments**

      - molB: the text of the mol block (lines separated by newlines)

    **Returns**

      the text of the randomized mol block

    """
    splitB = molB.split('\n')
    countsLine = splitB[3]
    res = splitB[0:3]
    res.append(countsLine)
    # the counts line starts with two 3-column fields: atom and bond counts
    nAts = int(countsLine[0:3])
    nBonds = int(countsLine[3:6])

    idx = 4
    atLines = splitB[idx:idx + nAts]

    # order[k] is the original (0-based) index of the atom written in slot k
    order = list(range(nAts))
    # random.shuffle() no longer accepts a ``random`` argument (removed in
    # Python 3.11); the default generator is the same module-level one.
    random.shuffle(order)

    for i in order:
        res.append(atLines[i])

    idx += nAts
    for bondIdx in range(idx, idx + nBonds):
        inL = splitB[bondIdx]
        oldBegin = int(inL[0:3]) - 1
        oldEnd = int(inL[3:6]) - 1
        newBegin = order.index(oldBegin)
        newEnd = order.index(oldEnd)
        # '%3d' keeps each index in exactly three columns; '% 3d' would emit
        # four characters for indices >= 100 and shift the rest of the line
        res.append('%3d%3d' % (newBegin + 1, newEnd + 1) + inL[6:])
    res.append('M  END')
    return '\n'.join(res)
Example #2
0
def RandomizeMolBlock(molB):
  """ Returns a copy of a V2000 mol block with the atoms in a random order

  The three header lines and the counts line are copied unchanged, the atom
  lines are written in a shuffled order and the two atom indices at the
  start of every bond line are renumbered to match.  Anything that follows
  the bond block in the input is dropped; a single 'M  END' line is
  appended.

  **Arguments**

    - molB: the text of the mol block (lines separated by newlines)

  **Returns**

    the text of the randomized mol block

  """
  splitB = molB.split('\n')
  countsLine = splitB[3]
  res = splitB[0:3]
  res.append(countsLine)
  # the counts line starts with two 3-column fields: atom and bond counts
  nAts = int(countsLine[0:3])
  nBonds = int(countsLine[3:6])

  idx = 4
  atLines = splitB[idx:idx + nAts]

  # order[k] is the original (0-based) index of the atom written in slot k
  order = list(range(nAts))
  # random.shuffle() no longer accepts a ``random`` argument (removed in
  # Python 3.11); the default generator is the same module-level one.
  random.shuffle(order)

  for i in order:
    res.append(atLines[i])

  idx += nAts
  for bondIdx in range(idx, idx + nBonds):
    inL = splitB[bondIdx]
    oldBegin = int(inL[0:3]) - 1
    oldEnd = int(inL[3:6]) - 1
    newBegin = order.index(oldBegin)
    newEnd = order.index(oldEnd)
    # '%3d' keeps each index in exactly three columns; '% 3d' would emit
    # four characters for indices >= 100 and shift the rest of the line
    res.append('%3d%3d' % (newBegin + 1, newEnd + 1) + inL[6:])
  res.append('M  END')
  return '\n'.join(res)
Example #3
0
def _testSpecific():
  """ Ad-hoc driver: grows a small ID3 tree, prunes it and prints the errors

  The tree is built from five hand-written points, checked against those
  points and against a hold-out set (the same points plus two extra copies
  of a conflicting one), pruned with PruneTree and checked again.
  Everything is reported via print(); nothing is returned.

  """
  from rdkit.ML.DecTree import ID3
  oPts = [
    [0, 0, 1, 0],
    [0, 1, 1, 1],
    [1, 0, 1, 1],
    [1, 1, 0, 0],
    [1, 1, 1, 1],
  ]
  tPts = oPts + [[0, 1, 1, 0], [0, 1, 1, 0]]

  # hand over a real list: a range object is immutable, so tree-building
  # code that copies and edits its attribute list cannot work with it
  tree = ID3.ID3Boot(oPts, attrs=list(range(3)), nPossibleVals=[2] * 4)
  tree.Print()
  err, _ = CrossValidate.CrossValidate(tree, oPts)
  print('original error:', err)

  err, _ = CrossValidate.CrossValidate(tree, tPts)
  print('original holdout error:', err)
  newTree, frac2 = PruneTree(tree, oPts, tPts)
  newTree.Print()
  err, badEx = CrossValidate.CrossValidate(newTree, tPts)
  print('pruned holdout error is:', err)
  print(badEx)

  print(len(tree), len(newTree))
Example #4
0
def _testChain():
    """ Ad-hoc driver: grows an ID3 tree on a small data set and prunes it

    The same thirteen points are used for training and as the hold-out set.
    The original tree, its errors, the pruned tree and the examples the
    pruned tree gets wrong are reported via print(); nothing is returned.

    """
    from rdkit.ML.DecTree import ID3
    oPts = [
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 0, 1],
    ]
    tPts = oPts
    nCols = len(oPts[0])

    # hand over a real list: a range object is immutable, so tree-building
    # code that copies and edits its attribute list cannot work with it
    tree = ID3.ID3Boot(oPts,
                       attrs=list(range(nCols - 1)),
                       nPossibleVals=[2] * nCols)
    tree.Print()
    err, _ = CrossValidate.CrossValidate(tree, oPts)
    print('original error:', err)

    err, _ = CrossValidate.CrossValidate(tree, tPts)
    print('original holdout error:', err)
    newTree, frac2 = PruneTree(tree, oPts, tPts)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, tPts)
    print('pruned holdout error is:', err)
    print(badEx)
Example #5
0
def TestQuantTree():  # pragma: nocover
    """ Builds two demonstration quant trees, pickling and printing each

    A tree is grown from a small hand-written data set, written to
    test_data/QuantTree1.pkl and printed; the same is then done with the
    depth limited to 1, which overwrites that file.

    The created pkl file is required by the unit test code.
    """
    rows = [
        ['p1', 0, 1, 0.1, 0],
        ['p2', 0, 0, 0.1, 1],
        ['p3', 0, 0, 1.1, 2],
        ['p4', 0, 1, 1.1, 2],
        ['p5', 1, 0, 0.1, 2],
        ['p6', 1, 0, 1.1, 2],
        ['p7', 1, 1, 0.1, 2],
        ['p8', 1, 1, 1.1, 0],
    ]
    # every column except the leading name and the trailing result
    varIds = list(range(1, len(rows[0]) - 1))
    valCounts = [0, 2, 2, 0, 3]
    boundCounts = [0, 0, 0, 1, 0]

    for title, extraArgs in (('base', {}), ('depth limit', {'maxDepth': 1})):
        print(title)
        tree = QuantTreeBoot(rows, varIds, valCounts, boundCounts, **extraArgs)
        tree.Pickle('test_data/QuantTree1.pkl')
        tree.Print()
Example #6
0
def TestQuantTree():
  """ testing code for named trees

  Grows a quant tree from a small hand-written data set, pickles it to
  test_data/QuantTree1.pkl and prints it, then repeats the exercise with
  the depth limited to 1 (overwriting the same file).

  """
  data = [['p1', 0, 1, 0.1, 0], ['p2', 0, 0, 0.1, 1],
          ['p3', 0, 0, 1.1, 2], ['p4', 0, 1, 1.1, 2],
          ['p5', 1, 0, 0.1, 2], ['p6', 1, 0, 1.1, 2],
          ['p7', 1, 1, 0.1, 2], ['p8', 1, 1, 1.1, 0]]
  nCols = len(data[0])
  # skip the name column at the front and the result column at the back
  usable = list(range(1, nCols - 1))
  nVals = [0, 2, 2, 0, 3]
  nBounds = [0, 0, 0, 1, 0]
  pklName = 'test_data/QuantTree1.pkl'

  print('base')
  full = QuantTreeBoot(data, usable, nVals, nBounds)
  full.Pickle(pklName)
  full.Print()

  print('depth limit')
  shallow = QuantTreeBoot(data, usable, nVals, nBounds, maxDepth=1)
  shallow.Pickle(pklName)
  shallow.Print()
Example #7
0
def _testChain():
  """ Ad-hoc driver: grows an ID3 tree on a small data set and prunes it

  The same thirteen points are used for training and as the hold-out set.
  The original tree, its errors, the pruned tree, the error reported by
  PruneTree and the examples the pruned tree gets wrong are reported via
  print(); nothing is returned.

  """
  from rdkit.ML.DecTree import ID3
  oPts = [
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [0, 0, 1, 1, 0],
    [0, 0, 1, 1, 0],
    [0, 0, 1, 1, 1],
    [0, 1, 0, 1, 0],
    [0, 1, 0, 1, 0],
    [0, 1, 0, 0, 1],
  ]
  tPts = oPts
  nCols = len(oPts[0])

  # hand over a real list: a range object is immutable, so tree-building
  # code that copies and edits its attribute list cannot work with it
  tree = ID3.ID3Boot(oPts, attrs=list(range(nCols - 1)), nPossibleVals=[2] * nCols)
  tree.Print()
  err, _ = CrossValidate.CrossValidate(tree, oPts)
  print('original error:', err)

  err, _ = CrossValidate.CrossValidate(tree, tPts)
  print('original holdout error:', err)
  newTree, frac2 = PruneTree(tree, oPts, tPts)
  newTree.Print()
  print('best error of pruned tree:', frac2)
  err, badEx = CrossValidate.CrossValidate(newTree, tPts)
  print('pruned holdout error is:', err)
  print(badEx)
Example #8
0
def CheckCanonicalization(mol, nReps=10):
    """ Checks that randomized copies of a molecule give the same SMILES

    The SMILES of _mol_ is generated once as the reference; _nReps_ copies
    produced by RandomizeMol are then converted the same way and compared
    with it.

    **Arguments**

      - mol: the molecule to check

      - nReps: (optional) the number of randomized copies to try

    **Raises**

      ValueError (showing both SMILES) at the first mismatch

    """
    expected = Chem.MolToSmiles(mol, False)
    for _ in range(nReps):
        shuffled = RandomizeMol(mol)
        found = Chem.MolToSmiles(shuffled, False)
        if found == expected:
            continue
        raise ValueError('\nRef: %s\n   : %s' % (expected, found))
Example #9
0
def CheckCanonicalization(mol, nReps=10):
  """ Checks that randomized copies of a molecule give the same SMILES

  **Arguments**

    - mol: the molecule to check

    - nReps: (optional) how many copies from RandomizeMol to compare

  **Raises**

    ValueError, with the reference and the differing SMILES in the message,
    as soon as one copy does not reproduce the reference SMILES

  """
  reference = Chem.MolToSmiles(mol, False)
  for _ in range(nReps):
    candidate = Chem.MolToSmiles(RandomizeMol(mol), False)
    if candidate != reference:
      message = '\nRef: %s\n   : %s' % (reference, candidate)
      raise ValueError(message)
Example #10
0
def _testSpecific():
    """ Ad-hoc driver: grows a small ID3 tree, prunes it and prints the errors

    The tree is built from five hand-written points, checked against those
    points and against a hold-out set (the same points plus two extra
    copies of a conflicting one), pruned with PruneTree and checked again.
    Everything is reported via print(); nothing is returned.

    """
    from rdkit.ML.DecTree import ID3
    oPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    tPts = oPts + [[0, 1, 1, 0], [0, 1, 1, 0]]

    # hand over a real list: a range object is immutable, so tree-building
    # code that copies and edits its attribute list cannot work with it
    tree = ID3.ID3Boot(oPts, attrs=list(range(3)), nPossibleVals=[2] * 4)
    tree.Print()
    err, _ = CrossValidate.CrossValidate(tree, oPts)
    print('original error:', err)

    err, _ = CrossValidate.CrossValidate(tree, tPts)
    print('original holdout error:', err)
    newTree, frac2 = PruneTree(tree, oPts, tPts)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, tPts)
    print('pruned holdout error is:', err)
    print(badEx)

    print(len(tree), len(newTree))
Example #11
0
def TestQuantTree():
    """ testing code for named trees

    A quant tree is grown from a small hand-written data set, pickled to
    test_data/QuantTree1.pkl and printed; a second tree limited to a depth
    of 1 is then built, pickled to the same file and printed.

    """
    samples = [
        ['p1', 0, 1, 0.1, 0],
        ['p2', 0, 0, 0.1, 1],
        ['p3', 0, 0, 1.1, 2],
        ['p4', 0, 1, 1.1, 2],
        ['p5', 1, 0, 0.1, 2],
        ['p6', 1, 0, 1.1, 2],
        ['p7', 1, 1, 0.1, 2],
        ['p8', 1, 1, 1.1, 0],
    ]
    # variable columns sit between the name (first) and the result (last)
    varCols = list(range(1, len(samples[0]) - 1))
    possibleVals = [0, 2, 2, 0, 3]
    boundsPerVar = [0, 0, 0, 1, 0]
    outFile = 'test_data/QuantTree1.pkl'

    for heading, options in [('base', {}), ('depth limit', dict(maxDepth=1))]:
        print(heading)
        built = QuantTreeBoot(samples, varCols, possibleVals, boundsPerVar,
                              **options)
        built.Pickle(outFile)
        built.Print()
Example #12
0
def RandomizeMolBlock(molB):
    """ Returns a copy of a V2000 mol block with the atoms in a random order

    The three header lines and the counts line are copied unchanged, the atom
    lines are written in a shuffled order and the two atom indices at the
    start of every bond line are renumbered to match.  Of the lines after
    the bond block only 'M  CHG' lines are carried over (with their atom
    indices renumbered); the rest is dropped and a single 'M  END' line is
    appended.

    **Arguments**

      - molB: the text of the mol block (lines separated by newlines)

    **Returns**

      the text of the randomized mol block

    """
    splitB = molB.split('\n')
    countsLine = splitB[3]
    res = splitB[0:3]
    res.append(countsLine)
    # the counts line starts with two 3-column fields: atom and bond counts
    nAts = int(countsLine[0:3])
    nBonds = int(countsLine[3:6])

    idx = 4
    atLines = splitB[idx:idx + nAts]

    # order[k] is the original (0-based) index of the atom written in slot k
    order = list(range(nAts))
    # random.shuffle() no longer accepts a ``random`` argument (removed in
    # Python 3.11); the default generator is the same module-level one.
    random.shuffle(order)

    for i in order:
        res.append(atLines[i])

    idx += nAts
    for bondIdx in range(idx, idx + nBonds):
        inL = splitB[bondIdx]
        oldBegin = int(inL[0:3]) - 1
        oldEnd = int(inL[3:6]) - 1
        newBegin = order.index(oldBegin)
        newEnd = order.index(oldEnd)
        # '%3d' keeps each index in exactly three columns; '% 3d' would emit
        # four characters for indices >= 100 and shift the rest of the line
        res.append('%3d%3d' % (newBegin + 1, newEnd + 1) + inL[6:])
    idx += nBonds

    # Charges: 'M  CHG' lines hold (atom index, charge) pairs after the
    # nine-character prefix that carries the tag and the entry count
    for line in splitB[idx:]:
        if line[0:6] != "M  CHG":
            continue
        fields = line.split()
        col = line[0:9]
        for pos in range(3, len(fields), 2):
            newAtom = order.index(int(fields[pos]) - 1) + 1
            # only the atom index is renumbered; the charge is copied as is
            col = col + "%4i%4i" % (newAtom, int(fields[pos + 1]))
        res.append(col)

    res.append('M  END')
    return '\n'.join(res)
Example #13
0
def FindVarMultQuantBounds(vals,nBounds,results,nPossibleRes):
  """ finds multiple quantization bounds for a single variable

   **Arguments**

     - vals: sequence of variable values (assumed to be floats)

     - nBounds: the number of quantization bounds to find

     - results: a list of result codes (should be integers)

     - nPossibleRes: an integer with the number of possible values of the
       result variable

   **Returns**

     - a 2-tuple containing:

       1) a list of the quantization bounds (floats)

       2) the information gain associated with this quantization

   **Notes**

     - with no data at all the return value is ([], -1e8); if no candidate
       start points are found it is ([0], 0.0)

  """
  assert len(vals) == len(results), 'vals/results length mismatch'

  nPts = len(vals)
  if nPts == 0:
    return [],-1e8

  # order the (value, result) pairs by value, then split them apart again
  ordered = sorted(zip(vals,results))
  sortedVals,sortedRes = zip(*ordered)
  candidates = _FindStartPoints(sortedVals,sortedRes,nPts)
  nCandidates = len(candidates)
  if not nCandidates:
    return [0],0.0
  # never ask for more bounds than the candidates allow, but always for one
  if nCandidates < nBounds:
    nBounds = nCandidates-1
  if nBounds == 0:
    nBounds = 1
  firstCuts = list(range(nBounds))
  maxGain,bestCuts = _RecurseOnBounds(sortedVals,firstCuts,0,candidates,
                                      sortedRes,nPossibleRes)

  bounds = []
  for cut in bestCuts:
    pos = candidates[cut]
    if pos == nPts:
      bound = sortedVals[-1]
    elif pos == 0:
      bound = sortedVals[pos]
    else:
      # halfway between the neighbouring values
      bound = (sortedVals[pos]+sortedVals[pos-1])/2.
    bounds.append(bound)

  return bounds,maxGain
Example #14
0
def FindVarMultQuantBounds(vals, nBounds, results, nPossibleRes):
    """ finds multiple quantization bounds for a single variable

   **Arguments**

     - vals: sequence of variable values (assumed to be floats)

     - nBounds: the number of quantization bounds to find

     - results: a list of result codes (should be integers)

     - nPossibleRes: an integer with the number of possible values of the
       result variable

   **Returns**

     - a 2-tuple containing:

       1) a list of the quantization bounds (floats)

       2) the information gain associated with this quantization

   **Notes**

     - with no data at all the return value is ([], -1e8); if no candidate
       start points are found it is ([0], 0.0)

  """
    assert len(vals) == len(results), 'vals/results length mismatch'

    count = len(vals)
    if count == 0:
        return [], -1e8

    # sort by value, keeping every result code with its value
    pairs = list(zip(vals, results))
    pairs.sort()
    valsSorted, resSorted = zip(*pairs)
    starts = _FindStartPoints(valsSorted, resSorted, count)
    if not len(starts):
        return [0], 0.0
    # cap the request by what the start points allow, with a floor of one
    if len(starts) < nBounds:
        nBounds = len(starts) - 1
    if nBounds == 0:
        nBounds = 1
    maxGain, bestCuts = _RecurseOnBounds(valsSorted, list(range(nBounds)), 0,
                                         starts, resSorted, nPossibleRes)

    def _boundAt(where):
        # translate an index into the sorted values into a bound value
        if where == count:
            return valsSorted[-1]
        if where == 0:
            return valsSorted[where]
        return (valsSorted[where] + valsSorted[where - 1]) / 2.

    quantBounds = []
    for cut in bestCuts:
        quantBounds.append(_boundAt(starts[cut]))

    return quantBounds, maxGain
Example #15
0
def TestTree():
  """ testing code for named trees

  Grows an ID3 tree (depth limited to 1) from a small hand-written data set
  and prints it.

  """
  data = [
    ['p1', 0, 1, 0, 0],
    ['p2', 0, 0, 0, 1],
    ['p3', 0, 0, 1, 2],
    ['p4', 0, 1, 1, 2],
    ['p5', 1, 0, 0, 2],
    ['p6', 1, 0, 1, 2],
    ['p7', 1, 1, 0, 2],
    ['p8', 1, 1, 1, 0],
  ]
  # every column except the leading name and the trailing result
  varIds = list(range(1, len(data[0]) - 1))
  valCounts = [0, 2, 2, 2, 3]
  tree = ID3.ID3Boot(data, varIds, valCounts, maxDepth=1)
  tree.Print()
Example #16
0
def TestTree():
    """ testing code for named trees

    A depth-1 ID3 tree is built from eight hand-written examples and
    printed.

    """
    samples = [
        ['p1', 0, 1, 0, 0],
        ['p2', 0, 0, 0, 1],
        ['p3', 0, 0, 1, 2],
        ['p4', 0, 1, 1, 2],
        ['p5', 1, 0, 0, 2],
        ['p6', 1, 0, 1, 2],
        ['p7', 1, 1, 0, 2],
        ['p8', 1, 1, 1, 0],
    ]
    nCols = len(samples[0])
    # the name (first) and result (last) columns are not variables
    usable = list(range(1, nCols - 1))
    ID3.ID3Boot(samples, usable, [0, 2, 2, 2, 3], maxDepth=1).Print()
Example #17
0
def GetFeatFeatDistMatrix(fm, mergeMetric, mergeTol, dirMergeMode, compatFunc):
    """ Builds the symmetric feature-feature matrix used to pick merges

    Every entry starts at 1e8.  For each pair of features that _compatFunc_
    accepts:

      - MergeMetric.Distance: the squared distance is stored if it is below
        mergeTol squared

      - MergeMetric.Overlap: the feature-feature score is multiplied by -1
        and by the weight of the second feature, and stored if the result
        is below mergeTol

    With MergeMetric.NoMerge the untouched matrix is returned; any other
    metric raises a ValueError.  _dirMergeMode_ is not used here.

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.

  """
    matrix = [[1e8] * fm.GetNumFeatures() for _ in range(fm.GetNumFeatures())]
    if mergeMetric == MergeMetric.NoMerge:
        return matrix

    if mergeMetric == MergeMetric.Distance:
        tolSquared = mergeTol * mergeTol
        for row in range(fm.GetNumFeatures()):
            featA = fm.GetFeature(row)
            for col in range(row + 1, fm.GetNumFeatures()):
                featB = fm.GetFeature(col)
                if not compatFunc(featA, featB):
                    continue
                sqDist = featA.GetDist2(featB)
                if sqDist < tolSquared:
                    matrix[row][col] = matrix[col][row] = sqDist
        return matrix

    if mergeMetric == MergeMetric.Overlap:
        for row in range(fm.GetNumFeatures()):
            featA = fm.GetFeature(row)
            for col in range(row + 1, fm.GetNumFeatures()):
                featB = fm.GetFeature(col)
                if not compatFunc(featA, featB):
                    continue
                score = fm.GetFeatFeatScore(featA, featB, typeMatch=False)
                score *= -1 * featB.weight
                if score < mergeTol:
                    matrix[row][col] = matrix[col][row] = score
        return matrix

    raise ValueError('unrecognized mergeMetric')
Example #18
0
def GetFeatFeatDistMatrix(fm,mergeMetric,mergeTol,dirMergeMode,compatFunc):
  """ Builds the symmetric feature-feature matrix used to pick merges

    Every entry starts at 1e8.  For each pair of features that _compatFunc_
    accepts:

      - MergeMetric.Distance: the squared distance is stored if it is below
        mergeTol squared

      - MergeMetric.Overlap: the feature-feature score is multiplied by -1
        and by the weight of the second feature, and stored if the result
        is below mergeTol

    With MergeMetric.NoMerge the untouched matrix is returned; any other
    metric raises a ValueError.  _dirMergeMode_ is not used here.

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.

  """
  table = [[1e8]*fm.GetNumFeatures() for _ in range(fm.GetNumFeatures())]
  if mergeMetric==MergeMetric.NoMerge:
    return table
  elif mergeMetric==MergeMetric.Distance:
    limit2 = mergeTol*mergeTol
    for a in range(fm.GetNumFeatures()):
      first = fm.GetFeature(a)
      for b in range(a+1,fm.GetNumFeatures()):
        second = fm.GetFeature(b)
        if compatFunc(first,second):
          d2 = first.GetDist2(second)
          if d2<limit2:
            # keep the matrix symmetric
            table[a][b]=d2
            table[b][a]=d2
  elif mergeMetric==MergeMetric.Overlap:
    for a in range(fm.GetNumFeatures()):
      first = fm.GetFeature(a)
      for b in range(a+1,fm.GetNumFeatures()):
        second = fm.GetFeature(b)
        if compatFunc(first,second):
          val = fm.GetFeatFeatScore(first,second,typeMatch=False)
          val *= -1*second.weight
          if val<mergeTol:
            # keep the matrix symmetric
            table[a][b]=val
            table[b][a]=val
  else:
    raise ValueError('unrecognized mergeMetric')

  return table
Example #19
0
def TestQuantTree2():
  """ testing code for named trees

  Grows a quant tree from a small hand-written data set in which two of the
  variables get one quantization bound each, prints it, pickles it to
  test_data/QuantTree2.pkl and finally prints every example next to the
  tree's classification of it.

  """
  rows = [
    ['p1', 0.1, 1, 0.1, 0],
    ['p2', 0.1, 0, 0.1, 1],
    ['p3', 0.1, 0, 1.1, 2],
    ['p4', 0.1, 1, 1.1, 2],
    ['p5', 1.1, 0, 0.1, 2],
    ['p6', 1.1, 0, 1.1, 2],
    ['p7', 1.1, 1, 0.1, 2],
    ['p8', 1.1, 1, 1.1, 0],
  ]
  # every column except the leading name and the trailing result
  varIds = list(range(1, len(rows[0]) - 1))
  valCounts = [0, 0, 2, 0, 3]
  boundCounts = [0, 1, 0, 1, 0]

  tree = QuantTreeBoot(rows, varIds, valCounts, boundCounts)
  tree.Print()
  tree.Pickle('test_data/QuantTree2.pkl')

  for row in rows:
    prediction = tree.ClassifyExample(row)
    print(row, prediction)
Example #20
0
def TestQuantTree2():
    """ testing code for named trees

    A quant tree is built from eight hand-written examples (two of the
    variables get one quantization bound each), printed and pickled to
    test_data/QuantTree2.pkl; each example is then printed together with
    the tree's classification of it.

    """
    samples = [
        ['p1', 0.1, 1, 0.1, 0],
        ['p2', 0.1, 0, 0.1, 1],
        ['p3', 0.1, 0, 1.1, 2],
        ['p4', 0.1, 1, 1.1, 2],
        ['p5', 1.1, 0, 0.1, 2],
        ['p6', 1.1, 0, 1.1, 2],
        ['p7', 1.1, 1, 0.1, 2],
        ['p8', 1.1, 1, 1.1, 0],
    ]
    nCols = len(samples[0])
    # the name (first) and result (last) columns are not variables
    usable = list(range(1, nCols - 1))
    possibleVals = [0, 0, 2, 0, 3]
    boundsPerVar = [0, 1, 0, 1, 0]

    built = QuantTreeBoot(samples, usable, possibleVals, boundsPerVar)
    built.Print()
    built.Pickle('test_data/QuantTree2.pkl')

    for sample in samples:
        outcome = built.ClassifyExample(sample)
        print(sample, outcome)
Example #21
0
def MaxCount(examples):
    """ given a set of examples, returns the most common result code

   **Arguments**

      examples: a list of examples to be counted; the result code of an
        example is its last element

   **Returns**

     the most common result code (the smallest one in case of a tie)

  """
    codes = [example[-1] for example in examples]
    nCodes = max(codes) + 1
    # tally[c] is the number of examples whose result code equals c
    tally = [sum(1 for code in codes if code == candidate)
             for candidate in range(nCodes)]
    return numpy.argmax(tally)
Example #22
0
def MaxCount(examples):
  """ given a set of examples, returns the most common result code

   **Arguments**

      examples: a list of examples to be counted; the result code of an
        example is its last element

   **Returns**

     the most common result code (the smallest one in case of a tie)

  """
  outcomes = [ex[-1] for ex in examples]
  top = max(outcomes)
  # one slot per possible code from 0 up to the largest one seen
  freq = []
  for code in range(top + 1):
    freq.append(outcomes.count(code))

  return numpy.argmax(freq)
Example #23
0
def _GenVarTable(vals, cuts, starts, results, nPossibleRes):
    """ Primarily intended for internal use

   constructs a variable table for the data passed in
   The table for a given variable records the number of times each possible value
    of that variable appears for each possible result of the function.

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

   **Returns**

     the varTable, a 2D Numeric array which is nVarValues x nPossibleRes

   **Notes**

     - _vals_ should be sorted!
     
  """
    nVals = len(cuts) + 1
    varTable = numpy.zeros((nVals, nPossibleRes), 'i')
    idx = 0
    for i in range(nVals - 1):
        cut = cuts[i]
        while idx < starts[cut]:
            varTable[i, results[idx]] += 1
            idx += 1
    while idx < len(vals):
        varTable[-1, results[idx]] += 1
        idx += 1
    return varTable
Example #24
0
def _GenVarTable(vals, cuts, starts, results, nPossibleRes):
  """ Primarily intended for internal use

   constructs a variable table for the data passed in
   The table for a given variable records the number of times each possible value
    of that variable appears for each possible result of the function.

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

   **Returns**

     the varTable, a 2D Numeric array which is nVarValues x nPossibleRes

   **Notes**

     - _vals_ should be sorted!

  """
  nVals = len(cuts) + 1
  varTable = numpy.zeros((nVals, nPossibleRes), 'i')
  idx = 0
  for i in range(nVals - 1):
    cut = cuts[i]
    while idx < starts[cut]:
      varTable[i, results[idx]] += 1
      idx += 1
  while idx < len(vals):
    varTable[-1, results[idx]] += 1
    idx += 1
  return varTable
Example #25
0
def TestQuantTree():  # pragma: nocover
  """ Builds two demonstration quant trees, pickling and printing each

  A tree is grown from a small hand-written data set, written to
  test_data/QuantTree1.pkl and printed; the same is then done with the
  depth limited to 1, which overwrites that file.

  The created pkl file is required by the unit test code.
  """
  rows = [
    ['p1', 0, 1, 0.1, 0],
    ['p2', 0, 0, 0.1, 1],
    ['p3', 0, 0, 1.1, 2],
    ['p4', 0, 1, 1.1, 2],
    ['p5', 1, 0, 0.1, 2],
    ['p6', 1, 0, 1.1, 2],
    ['p7', 1, 1, 0.1, 2],
    ['p8', 1, 1, 1.1, 0],
  ]
  # every column except the leading name and the trailing result
  varIds = list(range(1, len(rows[0]) - 1))
  valCounts = [0, 2, 2, 0, 3]
  boundCounts = [0, 0, 0, 1, 0]

  for title, extraArgs in (('base', {}), ('depth limit', {'maxDepth': 1})):
    print(title)
    tree = QuantTreeBoot(rows, varIds, valCounts, boundCounts, **extraArgs)
    tree.Pickle('test_data/QuantTree1.pkl')
    tree.Print()
Example #26
0
def FindBest(resCodes,
             examples,
             nBoundsPerVar,
             nPossibleRes,
             nPossibleVals,
             attrs,
             exIndices=None,
             **kwargs):
    """ Finds the variable which gives the largest information gain

    **Arguments**

      - resCodes: the result codes going with the selected examples (handed
        to Quantize.FindVarMultQuantBounds together with their values)

      - examples: a list of examples, each indexable by variable index

      - nBoundsPerVar: for each variable, the number of quantization bounds
        to find.  A variable with 0 bounds is evaluated through
        ID3.GenVarTable, one with a negative count is never selected.

      - nPossibleRes: the number of possible result codes

      - nPossibleVals: the number of possible values of each variable
        (handed to ID3.GenVarTable)

      - attrs: the indices of the variables to consider

      - exIndices: (optional) indices into _examples_ of the examples to
        use; all of them by default

      - randomDescriptors: (optional, keyword) if positive and smaller than
        the number of attrs, only that many randomly picked attrs are tried

    **Returns**

      a 3-tuple containing:

        1) the index of the best variable (-1 if none was found)

        2) the information gain for that variable (-1e6 if none was found)

        3) the quantization bounds for that variable ([] if it has none)

    **Notes**

      - when two variables give the same gain, the one with fewer
        quantization bounds wins

      - a diagnostic dump is printed if no variable could be selected

    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))

    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if nToTake > 0:
        nAttrs = len(attrs)
        if nToTake < nAttrs:
            ids = list(range(nAttrs))
            # random.shuffle() no longer accepts a ``random`` argument
            # (removed in Python 3.11)
            random.shuffle(ids)
            attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBounds, resCodes, nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices),
                                     nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            # negative bound count: this variable can never win
            gainHere = -1e6
            qBounds = []
        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere:
            # tie: prefer the variable needing fewer bounds
            if len(qBounds) < len(bestBounds):
                best = var
                bestBounds = qBounds
    if best == -1:
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds
Example #27
0
def _PyRecurseOnBounds(vals,
                       cuts,
                       which,
                       starts,
                       results,
                       nPossibleRes,
                       varTable=None):
    """ Primarily intended for internal use

   Recursively finds the best quantization boundaries

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables,
       this should be sorted

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ ).  This list is modified in place.

     - which: an integer indicating which bound is being adjusted here
       (and index into _cuts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

     - varTable: (optional) a variable table; it is regenerated from the
       current cuts on every pass through the loop, so a table passed in
       is replaced as soon as the loop runs

   **Returns**

     - a 2-tuple containing:

       1) the best information gain found so far (-1e6 if no position
          was evaluated)

       2) a list of the quantization bound indices ( _cuts_ for the best
          case), or None if no position was evaluated

   **Notes**

    - this is not even remotely efficient, which is why a C replacement
      was written

    - the recursion goes through the module-level name _RecurseOnBounds
      (defined outside this function), not through this function's own
      name

  """
    nBounds = len(cuts)
    maxGain = -1e6
    bestCuts = None
    # the last position this cut may take while leaving room in _starts_
    # for the cuts that follow it
    highestCutHere = len(starts) - nBounds + which
    if varTable is None:
        varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
    while cuts[which] <= highestCutHere:
        # score the current set of cuts
        varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
        gainHere = entropy.InfoGain(varTable)
        if gainHere > maxGain:
            maxGain = gainHere
            bestCuts = cuts[:]
        # recurse on the next vars if needed
        if which < nBounds - 1:
            # the recursion gets a copy, so it cannot change our cuts
            gainHere, cutsHere = _RecurseOnBounds(vals,
                                                  cuts[:],
                                                  which + 1,
                                                  starts,
                                                  results,
                                                  nPossibleRes,
                                                  varTable=varTable)
            if gainHere > maxGain:
                maxGain = gainHere
                bestCuts = cutsHere
        # update this cut
        cuts[which] += 1
        # push later cuts along so that no cut equals its predecessor
        for i in range(which + 1, nBounds):
            if cuts[i] == cuts[i - 1]:
                cuts[i] += 1

    return maxGain, bestCuts
Example #28
0
def _Pruner(node, level=0):
    """Recursively finds and removes the nodes whose removals improve classification

     **Arguments**

       - node: the tree to be pruned.  The pruning data should already be contained
         within node (i.e. node.GetExamples() should return the pruning data)

       - level: (optional) the level of recursion, used only in _verbose printing


     **Returns**

        the pruned version of node.  This is built from a deep copy of
        _node_; the replacements are made on copies.


     **Notes**

      - This uses a greedy algorithm which basically does a DFS traversal of the tree,
        removing nodes whenever possible.

      - If removing a node does not affect the accuracy, it *will be* removed.  We
        favor smaller trees.

      - Only non-terminal children holding at least one example are
        considered; terminal children and children without examples are
        left as they are.

      - Progress is printed when the module-level flag _verbose is set.

  """
    if _verbose:
        print('  ' * level, '<%d>  ' % level, '>>> Pruner')
    # work on a shallow copy of the child list
    children = node.GetChildren()[:]

    # bestTree holds the best variant found so far; bestErr is its local
    # error and starts at a large sentinel value
    bestTree = copy.deepcopy(node)
    bestErr = 1e6
    emptyChildren = []
    #
    # Loop over the children of this node, removing them when doing so
    #  either improves the local error or leaves it unchanged (we're
    #  introducing a bias for simpler trees).
    #
    for i in range(len(children)):
        child = children[i]
        examples = child.GetExamples()
        if _verbose:
            print('  ' * level, '<%d>  ' % level, ' Child:', i,
                  child.GetLabel())
            bestTree.Print()
            print()
        if len(examples):
            if _verbose:
                print('  ' * level, '<%d>  ' % level, '  Examples',
                      len(examples))
            if not child.GetTerminal():
                if _verbose:
                    print('  ' * level, '<%d>  ' % level, '    Nonterminal')

                # candidate changes are tried on a copy of the best tree
                workTree = copy.deepcopy(bestTree)
                #
                # First recurse on the child (try removing things below it)
                #
                newNode = _Pruner(child, level=level + 1)
                workTree.ReplaceChildIndex(i, newNode)
                tempErr = _GetLocalError(workTree)
                # "<=" so that an unchanged error still accepts the change
                if tempErr <= bestErr:
                    bestErr = tempErr
                    bestTree = copy.deepcopy(workTree)
                    if _verbose:
                        print('  ' * level, '<%d>  ' % level, '>->->->->->')
                        print('  ' * level, '<%d>  ' % level, 'replacing:', i,
                              child.GetLabel())
                        child.Print()
                        print('  ' * level, '<%d>  ' % level, 'with:')
                        newNode.Print()
                        print('  ' * level, '<%d>  ' % level, '<-<-<-<-<-<')
                else:
                    # no gain: restore the original child in the work tree
                    workTree.ReplaceChildIndex(i, child)
                #
                # Now try replacing the child entirely
                #
                # the replacement is a terminal node labelled with the most
                # common result code among the child's examples
                bestGuess = MaxCount(child.GetExamples())
                newNode = DecTree.DecTreeNode(workTree,
                                              'L:%d' % (bestGuess),
                                              label=bestGuess,
                                              isTerminal=1)
                newNode.SetExamples(child.GetExamples())
                workTree.ReplaceChildIndex(i, newNode)
                if _verbose:
                    print('  ' * level, '<%d>  ' % level, 'ATTEMPT:')
                    workTree.Print()
                newErr = _GetLocalError(workTree)
                if _verbose:
                    print('  ' * level, '<%d>  ' % level, '---> ', newErr,
                          bestErr)
                if newErr <= bestErr:
                    bestErr = newErr
                    bestTree = copy.deepcopy(workTree)
                    if _verbose:
                        print('  ' * level, '<%d>  ' % level, 'PRUNING:')
                        workTree.Print()
                else:
                    if _verbose:
                        print('  ' * level, '<%d>  ' % level, 'FAIL')
                    # whoops... put the child back in:
                    workTree.ReplaceChildIndex(i, child)
            else:
                if _verbose:
                    print('  ' * level, '<%d>  ' % level, '    Terminal')
        else:
            if _verbose:
                print('  ' * level, '<%d>  ' % level, '  No Examples',
                      len(examples))
            #
            # FIX:  we need to figure out what to do here (nodes that contain
            #   no examples in the testing set).  I can concoct arguments for
            #   leaving them in and for removing them.  At the moment they are
            #   left intact.
            #
            pass

    if _verbose:
        print('  ' * level, '<%d>  ' % level, '<<< out')
    return bestTree
Example #29
0
def BuildQuantTree(examples, target, attrs, nPossibleVals, nBoundsPerVar, depth=0, maxDepth=-1,
                   exIndices=None, **kwargs):
  """ Recursively grows a quantized decision tree over a subset of the examples

    **Arguments**

      - examples: a list of lists (nInstances x nVariables+1) of variable
        values + instance values

      - target: an int (not referenced in the body of this function)

      - attrs: a list of ints indicating which variables can be used in the tree

      - nPossibleVals: a list containing the number of possible values of
        every variable; the last entry is used as the number of result codes

      - nBoundsPerVar: the number of bounds to include for each variable

      - depth: (optional) the current depth in the tree

      - maxDepth: (optional) the maximum depth to which the tree
        will be grown; negative values switch the depth check off

      - exIndices: (optional) indices (into _examples_) of the points that
        belong to this node; every example is used when this is None

      - kwargs: passed on to _FindBest_ and to the recursive calls.  A true
        'recycleVars' entry leaves the chosen variable available to children.

    **Returns**

     a QuantTree.QuantTreeNode with the decision tree

    **NOTE:** This code cannot bootstrap (start from nothing...)
          use _QuantTreeBoot_ (below) for that.
  """
  node = QuantTree.QuantTreeNode(None, 'node')
  node.SetData(-666)
  nResultCodes = nPossibleVals[-1]

  if exIndices is None:
    exIndices = list(range(len(examples)))

  # histogram of the result codes present at this node
  resCodes = [int(examples[idx][-1]) for idx in exIndices]
  counts = [0] * nResultCodes
  for code in resCodes:
    counts[code] += 1
  populated = numpy.nonzero(counts)[0]

  if len(populated) == 1:
    # pure node: a single result code accounts for every example
    onlyCode = populated[0]
    node.SetLabel(onlyCode)
    node.SetName(str(onlyCode))
    node.SetTerminal(1)
    return node

  if len(attrs) == 0 or (maxDepth >= 0 and depth > maxDepth):
    # out of variables or too deep: fall back to the most common result code
    majority = numpy.argmax(counts)
    node.SetLabel(majority)
    node.SetName('%d?' % majority)
    node.SetTerminal(1)
    return node

  # pick the variable with the largest information gain
  best, _, bestBounds = FindBest(resCodes, examples, nBoundsPerVar, nResultCodes, nPossibleVals,
                                 attrs, exIndices=exIndices, **kwargs)
  # children may not reuse that variable unless recycling was requested
  nextAttrs = attrs[:]
  if not kwargs.get('recycleVars', 0):
    nextAttrs.remove(best)

  node.SetName('Var: %d' % (best))
  node.SetLabel(best)
  node.SetQuantBounds(bestBounds)
  node.SetTerminal(0)

  def _attachChild(subset):
    # an empty bin gets a majority-vote leaf, anything else gets a subtree
    if len(subset) == 0:
      majority = numpy.argmax(counts)
      node.AddChild('%d' % majority, label=majority, data=0.0, isTerminal=1)
    else:
      node.AddChildNode(
        BuildQuantTree(examples, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=depth + 1,
                       maxDepth=maxDepth, exIndices=subset, **kwargs))

  remaining = exIndices[:]
  if len(bestBounds) > 0:
    # quantized variable: peel off, bound by bound, the points lying below it
    for bound in bestBounds:
      below = []
      kept = []
      for idx in remaining:
        if examples[idx][best] < bound:
          below.append(idx)
        else:
          kept.append(idx)
      remaining = kept
      _attachChild(below)
    # whatever is left sits above the final bound
    _attachChild(remaining)
  else:
    # discrete variable: one child per possible value
    for val in range(nPossibleVals[best]):
      _attachChild([idx for idx in exIndices if examples[idx][best] == val])
  return node
Example #30
0
def _PyRecurseOnBounds(vals, cuts, which, starts, results, nPossibleRes, varTable=None):
  """ Primarily intended for internal use

   Exhaustively walks the positions of one quantization bound (and, through
   recursion, of the bounds after it) looking for the highest information gain

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables,
       this should be sorted

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ ); modified in place as the search advances

     - which: an integer indicating which bound is being adjusted here
       (and index into _cuts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

     - varTable: (optional) a precomputed variable table; one is generated
       when this is None, and it is regenerated for every cut position tried

   **Returns**

     - a 2-tuple containing:

       1) the best information gain found so far

       2) a list of the quantization bound indices ( _cuts_ for the best case),
          or None if no position was evaluated

   **Notes**

    - this is not even remotely efficient, which is why a C replacement
      was written

    - the recursive step goes through the module-level name _RecurseOnBounds_,
      not through this function's own name

  """
  nCuts = len(cuts)
  topGain = -1e6
  topCuts = None
  # leave enough room in _starts_ for the bounds that follow this one
  lastAllowed = len(starts) - nCuts + which
  if varTable is None:
    varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
  hasLaterBounds = which < nCuts - 1
  while cuts[which] <= lastAllowed:
    varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
    gain = entropy.InfoGain(varTable)
    if gain > topGain:
      topGain, topCuts = gain, cuts[:]
    if hasLaterBounds:
      # let the later bounds explore their positions from a copy of the cuts
      subGain, subCuts = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results, nPossibleRes,
                                          varTable=varTable)
      if subGain > topGain:
        topGain, topCuts = subGain, subCuts
    # advance this bound, then push along any later bound it collided with
    cuts[which] += 1
    for pos in range(which + 1, nCuts):
      if cuts[pos] == cuts[pos - 1]:
        cuts[pos] += 1

  return topGain, topCuts
Example #31
0
def QuantTreeBoot(examples, attrs, nPossibleVals, nBoundsPerVar, initialVar=None, maxDepth=-1,
                  **kwargs):
  """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
     choose the first variable in the tree (the standard greedy
     approach).  Otherwise, _initialVar_ will be used as the first
     split.

    **Arguments**

      - examples: a list of lists (nInstances x nVariables+1) of variable
        values + instance values

      - attrs: the variables that may be used in the tree; any variable whose
        entry in _nBoundsPerVar_ is -1 is dropped from a private copy

      - nPossibleVals: a list containing the number of possible values of
        every variable; the last entry is used as the number of result codes

      - nBoundsPerVar: the number of bounds to include for each variable

      - initialVar: (optional) the variable to use for the first split

      - maxDepth: (optional) passed on to _BuildQuantTree_

      - kwargs: passed on to _FindBest_ and _BuildQuantTree_.  A true
        'recycleVars' entry leaves the first variable available to children.

    **Returns**

     a QuantTree.QuantTreeNode with the decision tree

  """
  attrs = list(attrs)
  for i in range(len(nBoundsPerVar)):
    if nBoundsPerVar[i] == -1 and i in attrs:
      attrs.remove(i)

  tree = QuantTree.QuantTreeNode(None, 'node')
  nPossibleRes = nPossibleVals[-1]
  tree._nResultCodes = nPossibleRes

  resCodes = [int(x[-1]) for x in examples]
  counts = [0] * nPossibleRes
  for res in resCodes:
    counts[res] += 1
  if initialVar is None:
    best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                       nPossibleVals, attrs, **kwargs)
  else:
    best = initialVar
    if nBoundsPerVar[best] > 0:
      # a real list (as in FindBest): on Python 3 map() would hand the
      # quantizer a one-shot iterator with no length
      vTable = [x[best] for x in examples]
      qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBoundsPerVar[best], resCodes,
                                                          nPossibleRes)
    elif nBoundsPerVar[best] == 0:
      vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
      gainHere = entropy.InfoGain(vTable)
      qBounds = []
    else:
      gainHere = -1e6
      qBounds = []

  tree.SetName('Var: %d' % (best))
  tree.SetData(gainHere)
  tree.SetLabel(best)
  tree.SetTerminal(0)
  tree.SetQuantBounds(qBounds)
  nextAttrs = list(attrs)
  if not kwargs.get('recycleVars', 0):
    nextAttrs.remove(best)

  def _addBranch(subset):
    # grow a subtree when the bin has examples, otherwise add a leaf
    # labelled with the most common result code of the full data set
    if len(subset) != 0:
      tree.AddChildNode(
        BuildQuantTree(subset, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=1,
                       maxDepth=maxDepth, **kwargs))
    else:
      v = numpy.argmax(counts)
      tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

  if len(qBounds) > 0:
    # quantized variable: split off, bound by bound, the examples below it.
    # A single pass per bound keeps the original ordering without the
    # quadratic cost of repeated list.remove() calls.
    remaining = list(examples)
    for bound in qBounds:
      below = []
      rest = []
      for ex in remaining:
        if ex[best] < bound:
          below.append(ex)
        else:
          rest.append(ex)
      remaining = rest
      _addBranch(below)
    # add the last points remaining
    _addBranch(remaining)
  else:
    # discrete variable: one branch per possible value
    for val in range(nPossibleVals[best]):
      _addBranch([ex for ex in examples if ex[best] == val])
  return tree
Example #32
0
def FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes, nPossibleVals, attrs, exIndices=None,
             **kwargs):
  """ Finds the variable (and its quantization bounds) with the largest information gain

    **Arguments**

      - resCodes: the result codes handed to the quantizer together with the
        values of each candidate variable (presumably one per entry of
        _exIndices_, in the same order -- confirm against callers)

      - examples: a list of lists (nInstances x nVariables+1) of variable
        values + instance values

      - nBoundsPerVar: the number of bounds to use for each variable:
        > 0 means the variable is quantized, 0 means it is used as-is via
        _ID3.GenVarTable_, negative values mean the variable is never picked

      - nPossibleRes: the number of possible result codes

      - nPossibleVals: a list containing the number of possible values of
        every variable

      - attrs: the candidate variables

      - exIndices: (optional) indices (into _examples_) of the points to
        consider; every example is used when this is None

      - kwargs: a positive 'randomDescriptors' entry smaller than len(attrs)
        restricts the search to that many randomly chosen candidates

    **Returns**

      a 3-tuple:

        1) the index of the best variable (-1 if none was found)

        2) the information gain for that variable (-1e6 if none was found)

        3) the quantization bounds for that variable (possibly empty)

    **Notes**

      - when two variables have the same gain, the one with fewer bounds wins

  """
  bestGain = -1e6
  best = -1
  bestBounds = []

  if exIndices is None:
    exIndices = list(range(len(examples)))

  if not len(exIndices):
    return best, bestGain, bestBounds

  nToTake = kwargs.get('randomDescriptors', 0)
  if nToTake > 0:
    nAttrs = len(attrs)
    if nToTake < nAttrs:
      ids = list(range(nAttrs))
      # the second argument of random.shuffle() was removed in Python 3.11;
      # the default already draws from the module-level generator
      random.shuffle(ids)
      attrs = [attrs[x] for x in ids[:nToTake]]

  for var in attrs:
    nBounds = nBoundsPerVar[var]
    if nBounds > 0:
      try:
        vTable = [examples[x][var] for x in exIndices]
      except IndexError:
        print('index error retrieving variable: %d' % var)
        raise
      qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBounds, resCodes, nPossibleRes)
    elif nBounds == 0:
      vTable = ID3.GenVarTable((examples[x] for x in exIndices), nPossibleVals, [var])[0]
      gainHere = entropy.InfoGain(vTable)
      qBounds = []
    else:
      gainHere = -1e6
      qBounds = []
    if gainHere > bestGain:
      bestGain = gainHere
      bestBounds = qBounds
      best = var
    elif bestGain == gainHere:
      # tie: prefer the variable that needs fewer bounds
      if len(qBounds) < len(bestBounds):
        best = var
        bestBounds = qBounds
  if best == -1:
    # nothing usable was found; dump the inputs to help diagnose the caller
    print('best unaltered')
    print('\tattrs:', attrs)
    print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
    print('\texamples:')
    for example in (examples[x] for x in exIndices):
      print('\t\t', example)

  return best, bestGain, bestBounds
Example #33
0
def MergeFeatPoints(fm,
                    mergeMetric=MergeMetric.NoMerge,
                    mergeTol=1.5,
                    dirMergeMode=DirMergeMode.NoMerge,
                    mergeMethod=MergeMethod.WeightedAverage,
                    compatFunc=familiesMatch):
    """ Merges pairs of features in fm that are mutual nearest neighbors

    Arguments:
      - fm: the feature container; it is modified in place through
        GetFeature(), the features' SetPos()/weight, and DropFeature()
      - mergeMetric: forwarded to GetFeatFeatDistMatrix(); with
        MergeMetric.NoMerge the function returns immediately
      - mergeTol: only neighbors with a "distance" below this value are
        considered
      - dirMergeMode: forwarded to GetFeatFeatDistMatrix()
      - mergeMethod: how the surviving feature's position and weight are
        computed (WeightedAverage, Average or UseLarger)
      - compatFunc: forwarded to GetFeatFeatDistMatrix()

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.
    (The only filter applied in this function is `dist < mergeTol`; any
    score-based handling would have to come from GetFeatFeatDistMatrix --
    TODO confirm.)

    Raises ValueError if mergeMethod is not one of the handled values.

    returns whether or not any points were actually merged

  """
    res = False
    if mergeMetric == MergeMetric.NoMerge:
        return res
    dists = GetFeatFeatDistMatrix(fm, mergeMetric, mergeTol, dirMergeMode,
                                  compatFunc)
    # distOrders[i] is the list of (dist, j) tuples for the neighbors of i
    # that lie within mergeTol, sorted by increasing dist
    distOrders = [None] * len(dists)
    for i in range(len(dists)):
        distV = dists[i]
        distOrders[i] = []
        for j, dist in enumerate(distV):
            if dist < mergeTol:
                distOrders[i].append((dist, j))
        distOrders[i].sort()

    #print 'distOrders:'
    #print distOrders

    # we now know the "distances" and have rank-ordered list of
    # each point's neighbors. Work with that.

    # progressively merge nearest neighbors until there
    # are no more points left to merge
    featsInPlay = list(range(fm.GetNumFeatures()))
    featsToRemove = []
    #print '--------------------------------'
    while featsInPlay:
        # find two features who are mutual nearest neighbors:
        # (iterate over a copy because featsInPlay is edited inside the loop)
        fipCopy = featsInPlay[:]
        for fi in fipCopy:
            #print '>>>',fi,fipCopy,featsInPlay
            #print '\t',distOrders[fi]
            mergeThem = False
            if not distOrders[fi]:
                # no neighbors within tolerance: this feature can never merge
                featsInPlay.remove(fi)
                continue
            dist, nbr = distOrders[fi][0]
            if nbr not in featsInPlay:
                continue
            if distOrders[nbr][0][1] == fi:
                #print 'direct:',fi,nbr
                mergeThem = True
            else:
                # it may be that there are several points at about the same distance,
                # check for that now
                if (feq(distOrders[nbr][0][0], dist)):
                    for distJ, nbrJ in distOrders[nbr][1:]:
                        if feq(dist, distJ):
                            if nbrJ == fi:
                                #print 'indirect: ',fi,nbr
                                mergeThem = True
                                break
                        else:
                            break
            #print '    bottom:',mergeThem
            if mergeThem:
                break
        # fi, nbr and mergeThem deliberately carry over from the loop above
        if mergeThem:
            res = True
            featI = fm.GetFeature(fi)
            nbrFeat = fm.GetFeature(nbr)

            if mergeMethod == MergeMethod.WeightedAverage:
                newPos = featI.GetPos() * featI.weight + nbrFeat.GetPos(
                ) * nbrFeat.weight
                newPos /= (featI.weight + nbrFeat.weight)
                newWeight = (featI.weight + nbrFeat.weight) / 2
            elif mergeMethod == MergeMethod.Average:
                newPos = featI.GetPos() + nbrFeat.GetPos()
                newPos /= 2
                newWeight = (featI.weight + nbrFeat.weight) / 2
            elif mergeMethod == MergeMethod.UseLarger:
                if featI.weight > nbrFeat.weight:
                    newPos = featI.GetPos()
                    newWeight = featI.weight
                else:
                    newPos = nbrFeat.GetPos()
                    newWeight = nbrFeat.weight
            else:
                raise ValueError("bad mergeMethod")

            # the merged result is stored on fi; nbr is dropped at the end
            featI.SetPos(newPos)
            featI.weight = newWeight

            # nbr and fi are no longer valid targets:
            #print 'nbr done:',nbr,featsToRemove,featsInPlay
            featsToRemove.append(nbr)
            featsInPlay.remove(fi)
            featsInPlay.remove(nbr)
            # NOTE(review): the neighbor lists hold (dist, idx) tuples, so
            # removing the bare ints fi / nbr always raises ValueError and
            # these calls change nothing.  Confirm whether the intent was to
            # drop the entries whose idx is fi or nbr.
            for nbrList in distOrders:
                try:
                    nbrList.remove(fi)
                except ValueError:
                    pass
                try:
                    nbrList.remove(nbr)
                except ValueError:
                    pass
        else:
            #print ">>>> Nothing found, abort"
            break
    # drop the absorbed features in ascending order; the index is reduced by
    # the number already dropped (presumably because later indices shift down
    # after each DropFeature() call -- TODO confirm)
    featsToRemove.sort()
    for i, fIdx in enumerate(featsToRemove):
        fm.DropFeature(fIdx - i)
    return res
Example #34
0
def FindBRICSBonds(mol, randomizeOrder=False, silent=True):
    """ returns the bonds in a molecule that BRICS would cleave

  Arguments:
    - mol: the molecule to search; it is queried with HasSubstructMatch()
      and GetSubstructMatches()
    - randomizeOrder: if set, the bond-matcher groups and the patterns inside
      each group are visited in random order
    - silent: not used in the body of this function

  Each result is a 2-tuple: the pair of matched atom indices, and the pair of
  environment labels with letters (and commas) stripped.

  >>> from rdkit import Chem
  >>> m = Chem.MolFromSmiles('CCCOCC')
  >>> res = list(FindBRICSBonds(m))
  >>> res
  [((3, 2), ('3', '4')), ((3, 4), ('3', '4'))]

  a more complicated case:
  >>> m = Chem.MolFromSmiles('CCCOCCC(=O)c1ccccc1')
  >>> res = list(FindBRICSBonds(m))
  >>> res
  [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

  we can also randomize the order of the results:
  >>> random.seed(23)
  >>> res = list(FindBRICSBonds(m,randomizeOrder=True))
  >>> sorted(res)
  [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

  Note that this is a generator function :
  >>> res = FindBRICSBonds(m)
  >>> res
  <generator object ...>
  >>> next(res)
  ((3, 2), ('3', '4'))

  >>> m = Chem.MolFromSmiles('CC=CC')
  >>> res = list(FindBRICSBonds(m))
  >>> sorted(res)
  [((1, 2), ('7', '7'))]

  make sure we don't match ring bonds:
  >>> m = Chem.MolFromSmiles('O=C1NCCC1')
  >>> list(FindBRICSBonds(m))
  []

  another nice one, make sure environment 8 doesn't match something connected
  to a ring atom:
  >>> m = Chem.MolFromSmiles('CC1(C)CCCCC1')
  >>> list(FindBRICSBonds(m))
  []

  """
    letter = re.compile('[a-z,A-Z]')
    indices = list(range(len(bondMatchers)))
    bondsDone = set()
    if randomizeOrder:
        # the second argument of random.shuffle() was removed in Python 3.11;
        # the default already draws from the module-level generator
        random.shuffle(indices)

    # test every environment once up front so that bond patterns whose
    # environments are absent can be skipped without a substructure search
    envMatches = {}
    for env, patt in iteritems(environMatchers):
        envMatches[env] = mol.HasSubstructMatch(patt)
    for gpIdx in indices:
        if randomizeOrder:
            # shuffle a copy so the module-level table keeps its order
            compats = bondMatchers[gpIdx][:]
            random.shuffle(compats)
        else:
            compats = bondMatchers[gpIdx]
        for i1, i2, bType, patt in compats:
            if not envMatches['L' + i1] or not envMatches['L' + i2]:
                continue
            matches = mol.GetSubstructMatches(patt)
            label1 = letter.sub('', i1)
            label2 = letter.sub('', i2)
            for match in matches:
                # report each bond once, whichever direction it was matched in
                if match not in bondsDone and (match[1],
                                               match[0]) not in bondsDone:
                    bondsDone.add(match)
                    yield (((match[0], match[1]), (label1, label2)))
Example #35
0
def _Pruner(node, level=0):
  """Recursively finds and removes the nodes whose removals improve classification

     **Arguments**

       - node: the tree to be pruned.  The pruning data should already be contained
         within node (i.e. node.GetExamples() should return the pruning data)

       - level: (optional) the level of recursion, used only in _verbose printing


     **Returns**

        the pruned version of node (a deep copy; the best candidate found)


     **Notes**

      - This uses a greedy algorithm which basically does a DFS traversal of the tree,
        removing nodes whenever possible.

      - If removing a node does not affect the accuracy, it *will be* removed.  We
        favor smaller trees.

      - Children that are terminal, or that hold no examples, are left untouched.

  """
  if _verbose:
    print('  ' * level, '<%d>  ' % level, '>>> Pruner')
  # shallow copy of the child list so the loop is unaffected by replacements
  children = node.GetChildren()[:]

  # bestTree is the best candidate so far; bestErr starts high enough that
  # the first error computed below is always accepted
  bestTree = copy.deepcopy(node)
  bestErr = 1e6
  #
  # Loop over the children of this node, removing them when doing so
  #  either improves the local error or leaves it unchanged (we're
  #  introducing a bias for simpler trees).
  #
  for i in range(len(children)):
    child = children[i]
    examples = child.GetExamples()
    if _verbose:
      print('  ' * level, '<%d>  ' % level, ' Child:', i, child.GetLabel())
      bestTree.Print()
      print()
    if len(examples):
      if _verbose:
        print('  ' * level, '<%d>  ' % level, '  Examples', len(examples))
      if child.GetTerminal():
        # nothing below a terminal child to prune
        if _verbose:
          print('  ' * level, '<%d>  ' % level, '    Terminal')
        continue

      if _verbose:
        print('  ' * level, '<%d>  ' % level, '    Nonterminal')

      # experiment on a scratch copy so bestTree only changes on success
      workTree = copy.deepcopy(bestTree)
      #
      # First recurse on the child (try removing things below it)
      #
      newNode = _Pruner(child, level=level + 1)
      workTree.ReplaceChildIndex(i, newNode)
      tempErr = _GetLocalError(workTree)
      if tempErr <= bestErr:
        # "<=": ties go to the pruned version
        bestErr = tempErr
        bestTree = copy.deepcopy(workTree)
        if _verbose:
          print('  ' * level, '<%d>  ' % level, '>->->->->->')
          print('  ' * level, '<%d>  ' % level, 'replacing:', i, child.GetLabel())
          child.Print()
          print('  ' * level, '<%d>  ' % level, 'with:')
          newNode.Print()
          print('  ' * level, '<%d>  ' % level, '<-<-<-<-<-<')
      else:
        # pruned child was worse: restore the original child in the scratch tree
        workTree.ReplaceChildIndex(i, child)
      #
      # Now try replacing the child entirely
      #
      # (with a terminal node labelled by MaxCount() of the child's examples)
      bestGuess = MaxCount(child.GetExamples())
      newNode = DecTree.DecTreeNode(workTree, 'L:%d' % (bestGuess), label=bestGuess, isTerminal=1)
      newNode.SetExamples(child.GetExamples())
      workTree.ReplaceChildIndex(i, newNode)
      if _verbose:
        print('  ' * level, '<%d>  ' % level, 'ATTEMPT:')
        workTree.Print()
      newErr = _GetLocalError(workTree)
      if _verbose:
        print('  ' * level, '<%d>  ' % level, '---> ', newErr, bestErr)
      if newErr <= bestErr:
        bestErr = newErr
        bestTree = copy.deepcopy(workTree)
        if _verbose:
          print('  ' * level, '<%d>  ' % level, 'PRUNING:')
          workTree.Print()
      else:
        if _verbose:
          print('  ' * level, '<%d>  ' % level, 'FAIL')
        # whoops... put the child back in:
        workTree.ReplaceChildIndex(i, child)
    else:
      if _verbose:
        print('  ' * level, '<%d>  ' % level, '  No Examples', len(examples))
      #
      # FIX:  we need to figure out what to do here (nodes that contain
      #   no examples in the testing set).  I can concoct arguments for
      #   leaving them in and for removing them.  At the moment they are
      #   left intact.
      #
      pass

  if _verbose:
    print('  ' * level, '<%d>  ' % level, '<<< out')
  return bestTree
Example #36
0
        def test11(self):
            # test coordinate preservation:
            molblock = """
     RDKit          3D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2004    0.5900    0.6110 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.2328    1.3173    0.0343 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.4299    0.6533   -0.1500 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.3633   -0.7217   -0.3299 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.1552   -1.3791   -0.2207 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.1425   -0.7969    0.5335 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.1458   -1.4244    0.4108 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2976   -0.7398   -0.1026 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.4889   -0.7939    0.5501 N   0  0  0  0  0  0  0  0  0  0  0  0
    3.4615    0.1460    0.3535 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.0116    1.4034   -0.0296 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.9786    1.4264   -0.9435 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.1399    0.3193   -0.9885 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
            m = Chem.MolFromMolBlock(molblock)
            pieces = BreakBRICSBonds(m)

            frags = Chem.GetMolFrags(pieces, asMols=True)
            self.assertEqual(len(frags), 3)
            self.assertEqual(frags[0].GetNumAtoms(), 7)
            self.assertEqual(frags[1].GetNumAtoms(), 3)
            self.assertEqual(frags[2].GetNumAtoms(), 7)

            c1 = m.GetConformer()
            c2 = frags[0].GetConformer()
            for i in range(6):
                p1 = c1.GetAtomPosition(i)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(6)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c2 = frags[2].GetConformer()
            for i in range(6):
                p1 = c1.GetAtomPosition(i + 7)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(6)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c2 = frags[1].GetConformer()
            for i in range(1):
                p1 = c1.GetAtomPosition(i + 6)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(5)
            p2 = c2.GetAtomPosition(1)
            self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(0)
            self.assertEqual((p1 - p2).Length(), 0.0)

            # make sure multiple conformations (include 2D) also work:
            molblock = """
     RDKit          2D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2990   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000    1.3846    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990   -0.8654    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
            m2 = Chem.MolFromMolBlock(molblock)
            m.AddConformer(m2.GetConformer(), assignId=True)
            self.assertEqual(m.GetNumConformers(), 2)

            pieces = BreakBRICSBonds(m)
            frags = Chem.GetMolFrags(pieces, asMols=True)
            self.assertEqual(len(frags), 3)
            self.assertEqual(frags[0].GetNumAtoms(), 7)
            self.assertEqual(frags[1].GetNumAtoms(), 3)
            self.assertEqual(frags[2].GetNumAtoms(), 7)
            self.assertEqual(frags[0].GetNumConformers(), 2)
            self.assertEqual(frags[1].GetNumConformers(), 2)
            self.assertEqual(frags[2].GetNumConformers(), 2)

            c1 = m.GetConformer(0)
            c2 = frags[0].GetConformer(0)
            for i in range(6):
                p1 = c1.GetAtomPosition(i)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(6)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c2 = frags[2].GetConformer(0)
            for i in range(6):
                p1 = c1.GetAtomPosition(i + 7)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(6)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c2 = frags[1].GetConformer(0)
            for i in range(1):
                p1 = c1.GetAtomPosition(i + 6)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(5)
            p2 = c2.GetAtomPosition(1)
            self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(0)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c1 = m.GetConformer(1)
            c2 = frags[0].GetConformer(1)
            for i in range(6):
                p1 = c1.GetAtomPosition(i)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(6)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c2 = frags[2].GetConformer(1)
            for i in range(6):
                p1 = c1.GetAtomPosition(i + 7)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(6)
            self.assertEqual((p1 - p2).Length(), 0.0)

            c2 = frags[1].GetConformer(1)
            for i in range(1):
                p1 = c1.GetAtomPosition(i + 6)
                p2 = c2.GetAtomPosition(i)
                self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(5)
            p2 = c2.GetAtomPosition(1)
            self.assertEqual((p1 - p2).Length(), 0.0)
            p1 = c1.GetAtomPosition(6)
            p2 = c2.GetAtomPosition(0)
            self.assertEqual((p1 - p2).Length(), 0.0)
Example #37
0
def QuantTreeBoot(examples,
                  attrs,
                  nPossibleVals,
                  nBoundsPerVar,
                  initialVar=None,
                  maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
     choose the first variable in the tree (the standard greedy
     approach).  Otherwise, _initialVar_ will be used as the first
     split.

    **Arguments**

      - examples: a list of lists (nInstances x nVariables+1) of variable
        values + instance values

      - attrs: the variables that may be used in the tree; any variable whose
        entry in _nBoundsPerVar_ is -1 is dropped from a private copy

      - nPossibleVals: a list containing the number of possible values of
        every variable; the last entry is used as the number of result codes

      - nBoundsPerVar: the number of bounds to include for each variable

      - initialVar: (optional) the variable to use for the first split

      - maxDepth: (optional) passed on to _BuildQuantTree_

      - kwargs: passed on to _FindBest_ and _BuildQuantTree_.  A true
        'recycleVars' entry leaves the first variable available to children.

    **Returns**

     a QuantTree.QuantTreeNode with the decision tree

  """
    attrs = list(attrs)
    for i in range(len(nBoundsPerVar)):
        if nBoundsPerVar[i] == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1
    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar,
                                           nPossibleRes, nPossibleVals, attrs,
                                           **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # a real list (as in FindBest): on Python 3 map() would hand the
            # quantizer a one-shot iterator with no length
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBoundsPerVar[best], resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addBranch(subset):
        # grow a subtree when the bin has examples, otherwise add a leaf
        # labelled with the most common result code of the full data set
        if len(subset) != 0:
            tree.AddChildNode(
                BuildQuantTree(subset,
                               best,
                               nextAttrs,
                               nPossibleVals,
                               nBoundsPerVar,
                               depth=1,
                               maxDepth=maxDepth,
                               **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        # quantized variable: split off, bound by bound, the examples below
        # it.  A single pass per bound keeps the original ordering without
        # the quadratic cost of repeated list.remove() calls.
        remaining = list(examples)
        for bound in qBounds:
            below = []
            rest = []
            for ex in remaining:
                if ex[best] < bound:
                    below.append(ex)
                else:
                    rest.append(ex)
            remaining = rest
            _addBranch(below)
        # add the last points remaining
        _addBranch(remaining)
    else:
        # discrete variable: one branch per possible value
        for val in range(nPossibleVals[best]):
            _addBranch([ex for ex in examples if ex[best] == val])
    return tree
Example #38
0
def FindBRICSBonds(mol,randomizeOrder=False,silent=True):
  """ returns the bonds in a molecule that BRICS would cleave

  This is a generator function. Each item produced has the form
  ((idx1, idx2), (env1, env2)) where idx1 and idx2 are the first two
  indices of a substructure match of one of the bond patterns and
  env1 and env2 are the labels of the two environments with any
  letters stripped off.

  **Arguments**

    - mol: the molecule to search; it needs to provide
      HasSubstructMatch() and GetSubstructMatches()

    - randomizeOrder: (optional) if set, the order in which the groups of
      bond patterns, and the patterns inside each group, are tried is
      shuffled using the global random module state

    - silent: (optional) not used by this function

  >>> from rdkit import Chem
  >>> m = Chem.MolFromSmiles('CCCOCC')
  >>> res = list(FindBRICSBonds(m))
  >>> res
  [((3, 2), ('3', '4')), ((3, 4), ('3', '4'))]

  a more complicated case:
  >>> m = Chem.MolFromSmiles('CCCOCCC(=O)c1ccccc1')
  >>> res = list(FindBRICSBonds(m))
  >>> res
  [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

  we can also randomize the order of the results:
  >>> random.seed(23)
  >>> res = list(FindBRICSBonds(m,randomizeOrder=True))
  >>> sorted(res)
  [((3, 2), ('3', '4')), ((3, 4), ('3', '4')), ((6, 8), ('6', '16'))]

  Note that this is a generator function :
  >>> res = FindBRICSBonds(m)
  >>> res
  <generator object ...>
  >>> next(res)
  ((3, 2), ('3', '4'))

  >>> m = Chem.MolFromSmiles('CC=CC')
  >>> res = list(FindBRICSBonds(m))
  >>> sorted(res)
  [((1, 2), ('7', '7'))]

  make sure we don't match ring bonds:
  >>> m = Chem.MolFromSmiles('O=C1NCCC1')
  >>> list(FindBRICSBonds(m))
  []

  another nice one, make sure environment 8 doesn't match something connected
  to a ring atom:
  >>> m = Chem.MolFromSmiles('CC1(C)CCCCC1')
  >>> list(FindBRICSBonds(m))
  []

  """
  letter = re.compile('[a-z,A-Z]')
  indices = list(range(len(bondMatchers)))
  bondsDone=set()
  if randomizeOrder:
    # The "random" keyword of random.shuffle() was removed in Python 3.11,
    # so rely on the default generator.  The exact permutation obtained for
    # a given seed may differ from what the keyword form used to produce.
    random.shuffle(indices)

  # find out up front which environments are present in the molecule at all;
  # this lets us skip the (more expensive) bond pattern searches below
  envMatches={}
  for env,patt in iteritems(environMatchers):
    envMatches[env]=mol.HasSubstructMatch(patt)

  for gpIdx in indices:
    if randomizeOrder:
      # shuffle a copy so that the module-level table is left untouched
      compats = list(bondMatchers[gpIdx])
      random.shuffle(compats)
    else:
      compats = bondMatchers[gpIdx]
    for i1,i2,bType,patt in compats:
      # both end environments have to be present for the bond to match
      if not envMatches['L'+i1] or not envMatches['L'+i2]:
        continue
      matches = mol.GetSubstructMatches(patt)
      # drop the letters from the labels before reporting them
      i1 = letter.sub('',i1)
      i2 = letter.sub('',i2)
      for match in matches:
        # report every bond only once, whichever direction it was found in
        if match not in bondsDone and (match[1],match[0]) not in bondsDone:
          bondsDone.add(match)
          yield(((match[0],match[1]),(i1,i2)))
Example #39
0
def MergeFeatPoints(fm,mergeMetric=MergeMetric.NoMerge,mergeTol=1.5,
                    dirMergeMode=DirMergeMode.NoMerge,
                    mergeMethod=MergeMethod.WeightedAverage,
                    compatFunc=familiesMatch):
  """ Merges pairs of features of _fm_ that are each other's nearest neighbor

    The "distances" come from GetFeatFeatDistMatrix(); only neighbors with a
    value below mergeTol are considered. For every merged pair the first
    feature receives the new position and weight (according to mergeMethod)
    and the second one is dropped from _fm_. A feature takes part in at most
    one merge per call.

    NOTE that mergeTol is a max value for merging when using distance-based
    merging and a min value when using score-based merging.

    raises a ValueError if a merge is attempted with an unknown mergeMethod

    returns whether or not any points were actually merged

  """
  if mergeMetric==MergeMetric.NoMerge:
    return False

  dists = GetFeatFeatDistMatrix(fm,mergeMetric,mergeTol,dirMergeMode,compatFunc)

  # for each feature: the (dist,index) pairs of all neighbors that are
  # closer than mergeTol, nearest one first
  distOrders = [sorted((dist,j) for j,dist in enumerate(dists[i]) if dist<mergeTol)
                for i in range(len(dists))]

  anyMerged = False
  featsInPlay = list(range(fm.GetNumFeatures()))
  featsToRemove = []

  # keep merging mutual nearest neighbors until a full pass over the
  # features still in play finds no pair to merge
  while featsInPlay:
    pair = None
    for fi in list(featsInPlay):
      ranked = distOrders[fi]
      if not ranked:
        # nothing within tolerance, this feature can never be merged
        featsInPlay.remove(fi)
        continue
      dist,nbr = ranked[0]
      if nbr not in featsInPlay:
        continue
      nbrRanked = distOrders[nbr]
      isMutual = nbrRanked[0][1]==fi
      if not isMutual and feq(nbrRanked[0][0],dist):
        # the neighbor may have several points at about the same distance;
        # look through those ties for fi
        for distJ,nbrJ in nbrRanked[1:]:
          if not feq(dist,distJ):
            break
          if nbrJ==fi:
            isMutual = True
            break
      if isMutual:
        pair = (fi,nbr)
        break

    if pair is None:
      # nothing found, we are done
      break

    fi,nbr = pair
    anyMerged = True
    keptFeat = fm.GetFeature(fi)
    goneFeat = fm.GetFeature(nbr)

    if mergeMethod==MergeMethod.WeightedAverage:
      newPos = keptFeat.GetPos()*keptFeat.weight+goneFeat.GetPos()*goneFeat.weight
      newPos /= (keptFeat.weight+goneFeat.weight)
      newWeight = (keptFeat.weight+goneFeat.weight)/2
    elif mergeMethod==MergeMethod.Average:
      newPos = keptFeat.GetPos()+goneFeat.GetPos()
      newPos /= 2
      newWeight = (keptFeat.weight+goneFeat.weight)/2
    elif mergeMethod==MergeMethod.UseLarger:
      winner = keptFeat if keptFeat.weight>goneFeat.weight else goneFeat
      newPos = winner.GetPos()
      newWeight = winner.weight
    else:
      raise ValueError("bad mergeMethod")

    keptFeat.SetPos(newPos)
    keptFeat.weight = newWeight

    # nbr and fi are no longer valid targets:
    featsToRemove.append(nbr)
    featsInPlay.remove(fi)
    featsInPlay.remove(nbr)
    # NOTE(review): the neighbor lists hold (dist,index) tuples, so removing
    # a bare index looks like it never matches anything; kept unchanged so
    # that the merge results stay the same.
    for nbrList in distOrders:
      for stale in (fi,nbr):
        try:
          nbrList.remove(stale)
        except ValueError:
          pass

  # drop the merged-away features; the indices shift down by one with
  # every feature that has already been dropped
  for nDropped,fIdx in enumerate(sorted(featsToRemove)):
    fm.DropFeature(fIdx-nDropped)
  return anyMerged
Example #40
0
def BuildQuantTree(examples,
                   target,
                   attrs,
                   nPossibleVals,
                   nBoundsPerVar,
                   depth=0,
                   maxDepth=-1,
                   exIndices=None,
                   **kwargs):
    """ Recursively builds a decision tree, splitting on quantization bounds
    where FindBest() provides them and on the discrete values otherwise.

    **Arguments**

      - examples: a list of lists (nInstances x nVariables+1) of variable
        values + instance values

      - target: an int (not used directly by this function)

      - attrs: a sequence of ints indicating which variables can be used in
        the tree; it is copied, never modified

      - nPossibleVals: a list containing the number of possible values of
                   every variable.

      - nBoundsPerVar: the number of bounds to include for each variable

      - depth: (optional) the current depth in the tree

      - maxDepth: (optional) the maximum depth to which the tree
                   will be grown; negative values disable the limit

      - exIndices: (optional) the indices into _examples_ of the points to be
                   used at this node. If not provided, all examples are used.

      - kwargs: passed on to FindBest() and to the recursive calls. If
                   'recycleVars' is set, the variable picked at a node
                   remains available to its subtrees.

    **Returns**

     a QuantTree.QuantTreeNode with the decision tree

    **NOTE:** This code cannot bootstrap (start from nothing...)
          use _QuantTreeBoot_ (below) for that.
    """
    tree = QuantTree.QuantTreeNode(None, 'node')
    tree.SetData(-666)
    nPossibleRes = nPossibleVals[-1]

    if exIndices is None:
        exIndices = list(range(len(examples)))

    # counts of each result code:
    resCodes = [int(examples[idx][-1]) for idx in exIndices]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1
    nzCounts = numpy.nonzero(counts)[0]

    if len(nzCounts) == 1:
        # bottomed out because there is only one result code left
        #  with any counts (i.e. there's only one type of example
        #  left... this is GOOD!).
        res = nzCounts[0]
        tree.SetLabel(res)
        tree.SetName(str(res))
        tree.SetTerminal(1)
        return tree

    if len(attrs) == 0 or (maxDepth >= 0 and depth > maxDepth):
        # Bottomed out: no variables left or max depth hit
        #  We don't really know what to do here, so
        #  use the heuristic of picking the most prevalent
        #  result
        v = numpy.argmax(counts)
        tree.SetLabel(v)
        tree.SetName('%d?' % v)
        tree.SetTerminal(1)
        return tree

    # find the variable which gives us the largest information gain
    best, _, bestBounds = FindBest(resCodes,
                                   examples,
                                   nBoundsPerVar,
                                   nPossibleRes,
                                   nPossibleVals,
                                   attrs,
                                   exIndices=exIndices,
                                   **kwargs)

    # remove that variable from the list of possible variables. list() is
    #  used for the copy because slicing e.g. a range yields another range,
    #  which has no remove()
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    # set some info at this node
    tree.SetName('Var: %d' % (best))
    tree.SetLabel(best)
    tree.SetQuantBounds(bestBounds)
    tree.SetTerminal(0)

    def _addBranch(branchIndices):
        """ adds the child for one value (range) of the chosen variable """
        if len(branchIndices) == 0:
            # this particular value of the variable has no examples,
            #  so there's not much sense in recursing.
            #  This can (and does) happen.
            v = numpy.argmax(counts)
            tree.AddChild('%d' % v, label=v, data=0.0, isTerminal=1)
        else:
            # recurse
            tree.AddChildNode(
                BuildQuantTree(examples,
                               best,
                               nextAttrs,
                               nPossibleVals,
                               nBoundsPerVar,
                               depth=depth + 1,
                               maxDepth=maxDepth,
                               exIndices=branchIndices,
                               **kwargs))

    # loop over possible values of the new variable and
    #  build a subtree for each one
    if len(bestBounds) > 0:
        remaining = list(exIndices)
        for bound in bestBounds:
            # split off the points below this bound in a single pass; the
            #  relative order of the indices is preserved on both sides
            below = []
            notBelow = []
            for index in remaining:
                if examples[index][best] < bound:
                    below.append(index)
                else:
                    notBelow.append(index)
            remaining = notBelow
            _addBranch(below)
        # add the last points remaining
        _addBranch(remaining)
    else:
        for val in range(nPossibleVals[best]):
            _addBranch([idx for idx in exIndices if examples[idx][best] == val])
    return tree
Example #41
0
    def test11(self):
      # test coordinate preservation:
      molblock="""
     RDKit          3D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2004    0.5900    0.6110 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.2328    1.3173    0.0343 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.4299    0.6533   -0.1500 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.3633   -0.7217   -0.3299 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.1552   -1.3791   -0.2207 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.1425   -0.7969    0.5335 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.1458   -1.4244    0.4108 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2976   -0.7398   -0.1026 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.4889   -0.7939    0.5501 N   0  0  0  0  0  0  0  0  0  0  0  0
    3.4615    0.1460    0.3535 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.0116    1.4034   -0.0296 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.9786    1.4264   -0.9435 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.1399    0.3193   -0.9885 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
      m = Chem.MolFromMolBlock(molblock)
      pieces = BreakBRICSBonds(m)

      frags = Chem.GetMolFrags(pieces,asMols=True)
      self.assertEqual(len(frags),3)
      self.assertEqual(frags[0].GetNumAtoms(),7)
      self.assertEqual(frags[1].GetNumAtoms(),3)
      self.assertEqual(frags[2].GetNumAtoms(),7)

      c1 = m.GetConformer()
      c2 = frags[0].GetConformer()
      for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(6)
      self.assertEqual((p1-p2).Length(),0.0)

      c2 = frags[2].GetConformer()
      for i in range(6):
        p1 = c1.GetAtomPosition(i+7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(6)
      self.assertEqual((p1-p2).Length(),0.0)

      c2 = frags[1].GetConformer()
      for i in range(1):
        p1 = c1.GetAtomPosition(i+6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(5)
      p2 = c2.GetAtomPosition(1)
      self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(0)
      self.assertEqual((p1-p2).Length(),0.0)


      # make sure multiple conformations (include 2D) also work:
      molblock="""
     RDKit          2D

 13 14  0  0  0  0  0  0  0  0999 V2000
   -1.2990   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000    1.3846    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.2990   -0.8654    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981   -1.6154    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971   -0.8654    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8971    0.6346    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.5981    1.3846    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  1  0
  8  9  2  0
  9 10  1  0
 10 11  2  0
 11 12  1  0
 12 13  2  0
  6  1  1  0
 13  8  1  0
M  END
"""
      m2 = Chem.MolFromMolBlock(molblock)
      m.AddConformer(m2.GetConformer(),assignId=True)
      self.assertEqual(m.GetNumConformers(),2)

      pieces = BreakBRICSBonds(m)
      frags = Chem.GetMolFrags(pieces,asMols=True)
      self.assertEqual(len(frags),3)
      self.assertEqual(frags[0].GetNumAtoms(),7)
      self.assertEqual(frags[1].GetNumAtoms(),3)
      self.assertEqual(frags[2].GetNumAtoms(),7)
      self.assertEqual(frags[0].GetNumConformers(),2)
      self.assertEqual(frags[1].GetNumConformers(),2)
      self.assertEqual(frags[2].GetNumConformers(),2)

      c1 = m.GetConformer(0)
      c2 = frags[0].GetConformer(0)
      for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(6)
      self.assertEqual((p1-p2).Length(),0.0)

      c2 = frags[2].GetConformer(0)
      for i in range(6):
        p1 = c1.GetAtomPosition(i+7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(6)
      self.assertEqual((p1-p2).Length(),0.0)

      c2 = frags[1].GetConformer(0)
      for i in range(1):
        p1 = c1.GetAtomPosition(i+6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(5)
      p2 = c2.GetAtomPosition(1)
      self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(0)
      self.assertEqual((p1-p2).Length(),0.0)

      c1 = m.GetConformer(1)
      c2 = frags[0].GetConformer(1)
      for i in range(6):
        p1 = c1.GetAtomPosition(i)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(6)
      self.assertEqual((p1-p2).Length(),0.0)

      c2 = frags[2].GetConformer(1)
      for i in range(6):
        p1 = c1.GetAtomPosition(i+7)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(6)
      self.assertEqual((p1-p2).Length(),0.0)

      c2 = frags[1].GetConformer(1)
      for i in range(1):
        p1 = c1.GetAtomPosition(i+6)
        p2 = c2.GetAtomPosition(i)
        self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(5)
      p2 = c2.GetAtomPosition(1)
      self.assertEqual((p1-p2).Length(),0.0)
      p1 = c1.GetAtomPosition(6)
      p2 = c2.GetAtomPosition(0)
      self.assertEqual((p1-p2).Length(),0.0)