def GenVarTable(examples, nPossibleVals, vars):
    """Generates a list of variable tables for the examples passed in.

    The table for a given variable records the number of times each possible
    value of that variable appears for each possible result of the function.

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values +
        instance values

      - nPossibleVals: a list containing the number of possible values of
        each variable + the number of values of the function.

      - vars: a list of the variables to include in the var table

    **Returns**

      a list of variable result tables. Each table is a numpy integer array
      which is varValues x nResults

    """
    nFuncVals = nPossibleVals[-1]
    # one zeroed (nValues x nResults) count table per requested variable
    res = [numpy.zeros((nPossibleVals[var], nFuncVals), 'i') for var in vars]
    for example in examples:
        # the result code is read from the last element of each example
        val = int(example[-1])
        for table, var in zip(res, vars):
            table[int(example[var]), val] += 1
    return res
def ConstructNodes(self, nodeCounts, actFunc, actFuncParms):
    """ build an unconnected network and set node counts

    **Arguments**

      - nodeCounts: a list containing the number of nodes to be in each layer.
         the ordering is:
          (nInput,nHidden1,nHidden2, ... , nHiddenN, nOutput)

      - actFunc: forwarded as the _actFunc_ keyword to every node created

      - actFuncParms: forwarded as the _actFuncParms_ keyword to every node
        created

    """
    self.nodeCounts = nodeCounts
    self.numInputNodes = nodeCounts[0]
    self.numOutputNodes = nodeCounts[-1]
    self.numHiddenLayers = len(nodeCounts) - 2
    # everything between the first (input) and last (output) entries
    self.numInHidden = list(nodeCounts[1:-1])

    numNodes = sum(self.nodeCounts)
    # allocate the list first: the same list object is handed to every node
    # constructor, so it has to exist before the nodes are created
    self.nodeList = [None] * numNodes
    for i in range(numNodes):
        self.nodeList[i] = NetNode.NetNode(i, self.nodeList, actFunc=actFunc,
                                           actFuncParms=actFuncParms)

    # consecutive blocks of node indices, one block per layer; stored as real
    # lists so the result does not depend on range() being lazy
    self.layerIndices = []
    start = 0
    for count in nodeCounts:
        end = start + count
        self.layerIndices.append(list(range(start, end)))
        start = end
def ConstructNodes(self, nodeCounts, actFunc, actFuncParms):
    """ build an unconnected network and set node counts

    **Arguments**

      - nodeCounts: a list containing the number of nodes to be in each layer.
         the ordering is:
          (nInput,nHidden1,nHidden2, ... , nHiddenN, nOutput)

      - actFunc: forwarded as the _actFunc_ keyword to every node created

      - actFuncParms: forwarded as the _actFuncParms_ keyword to every node
        created

    """
    self.nodeCounts = nodeCounts
    self.numInputNodes = nodeCounts[0]
    self.numOutputNodes = nodeCounts[-1]
    self.numHiddenLayers = len(nodeCounts) - 2
    # everything between the first (input) and last (output) entries
    self.numInHidden = list(nodeCounts[1:-1])

    numNodes = sum(self.nodeCounts)
    # allocate the list first: the same list object is handed to every node
    # constructor, so it has to exist before the nodes are created
    self.nodeList = [None] * numNodes
    for i in range(numNodes):
        self.nodeList[i] = NetNode.NetNode(i, self.nodeList, actFunc=actFunc,
                                           actFuncParms=actFuncParms)

    # consecutive blocks of node indices, one block per layer; stored as real
    # lists so the result does not depend on range() being lazy
    self.layerIndices = []
    start = 0
    for count in nodeCounts:
        end = start + count
        self.layerIndices.append(list(range(start, end)))
        start = end
def GenVarTable(examples, nPossibleVals, vars):
    """Generates a list of variable tables for the examples passed in.

    The table for a given variable records the number of times each possible
    value of that variable appears for each possible result of the function.

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values +
        instance values

      - nPossibleVals: a list containing the number of possible values of
        each variable + the number of values of the function.

      - vars: a list of the variables to include in the var table

    **Returns**

      a list of variable result tables. Each table is a numpy integer array
      which is varValues x nResults

    """
    nFuncVals = nPossibleVals[-1]
    # one zeroed (nValues x nResults) count table per requested variable
    res = [numpy.zeros((nPossibleVals[var], nFuncVals), 'i') for var in vars]
    for example in examples:
        # the result code is read from the last element of each example
        val = int(example[-1])
        for table, var in zip(res, vars):
            table[int(example[var]), val] += 1
    return res
def ClassifyExample(self, example, appendExamples=0):
    """ classifies a given example and returns the results of the output layer.

    **Arguments**

      - example: the example to be classified. It may carry extra elements:
        trailing values (one per output node) are stripped, and if there are
        still too many elements the first one is dropped as well.

      - appendExamples: not used by this method

    **Returns**

      the value of the last node when the network has a single output node,
      otherwise the full vector of node values (inputs included).

    **NOTE:** if the output layer is only one element long,
       a scalar (not a list) will be returned.  This is why a lot of the other
       network code claims to only support single valued outputs.

    """
    nInputs = self.numInputNodes
    if len(example) > nInputs:
        if len(example) - nInputs > self.numOutputNodes:
            # more extras than output values: also drop the leading element
            example = example[1:-self.numOutputNodes]
        else:
            example = example[:-self.numOutputNodes]
    assert len(example) == nInputs

    totNumNodes = sum(self.nodeCounts)
    results = numpy.zeros(totNumNodes, numpy.float64)
    # the first nInputs slots hold the input values themselves
    for i in range(nInputs):
        results[i] = example[i]
    # each remaining node writes its own value into results when evaluated
    for i in range(nInputs, totNumNodes):
        self.nodeList[i].Eval(results)

    # slicing a numpy array yields a view, so take a real copy to keep the
    # stored results independent of the array handed back to the caller
    self.lastResults = results.copy()
    if self.numOutputNodes == 1:
        return results[-1]
    return results
def ClassifyExample(self, example, appendExamples=0):
    """ classifies a given example and returns the results of the output layer.

    **Arguments**

      - example: the example to be classified. It may carry extra elements:
        trailing values (one per output node) are stripped, and if there are
        still too many elements the first one is dropped as well.

      - appendExamples: not used by this method

    **Returns**

      the value of the last node when the network has a single output node,
      otherwise the full vector of node values (inputs included).

    **NOTE:** if the output layer is only one element long,
       a scalar (not a list) will be returned.  This is why a lot of the other
       network code claims to only support single valued outputs.

    """
    nInputs = self.numInputNodes
    if len(example) > nInputs:
        if len(example) - nInputs > self.numOutputNodes:
            # more extras than output values: also drop the leading element
            example = example[1:-self.numOutputNodes]
        else:
            example = example[:-self.numOutputNodes]
    assert len(example) == nInputs

    totNumNodes = sum(self.nodeCounts)
    results = numpy.zeros(totNumNodes, numpy.float64)
    # the first nInputs slots hold the input values themselves
    for i in range(nInputs):
        results[i] = example[i]
    # each remaining node writes its own value into results when evaluated
    for i in range(nInputs, totNumNodes):
        self.nodeList[i].Eval(results)

    # slicing a numpy array yields a view, so take a real copy to keep the
    # stored results independent of the array handed back to the caller
    self.lastResults = results.copy()
    if self.numOutputNodes == 1:
        return results[-1]
    return results
def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None):
    """ calculates the number of possible values for each variable in a data set

    **Arguments**

      - data: a list of examples

      - order: the ordering map between the variables in _data_ and _qBounds_

      - qBounds: the quantization bounds for the variables

      - nQBounds: optional list; variables with a nonzero entry here are not
        counted (they end up with 0 possible values in the result)

    **Returns**

       a list with the number of possible values each variable takes on in
       the data set

    **Notes**

      - variables present in _qBounds_ will have their _nPossible_ number read
        from _qBounds_

      - _nPossible_ for other numeric variables will be calculated

      - a variable containing a non-integral or non-numeric value gets 0

    """
    numericTypes = [int, float]
    try:
        numericTypes.append(long)  # only defined on Python 2
    except NameError:
        pass

    print('order:', order, len(order))
    print('qB:', qBounds)
    assert (qBounds and len(order) == len(qBounds)) or \
        (nQBounds and len(order) == len(nQBounds)), 'order/qBounds mismatch'
    nVars = len(order)
    nPossible = [-1] * nVars
    # columns whose count still has to be derived from the data; this must be
    # a real list because entries are removed from it below
    cols = list(range(nVars))
    for i in range(nVars):
        if nQBounds and nQBounds[i] != 0:
            nPossible[i] = -1
            cols.remove(i)
        elif qBounds and len(qBounds[i]) > 0:
            nPossible[i] = len(qBounds[i])
            cols.remove(i)

    for pt in data:
        # iterate over a copy: columns are dropped as soon as they disqualify
        for col in cols[:]:
            d = pt[order[col]]
            if type(d) in numericTypes:
                if int(d) == d:
                    nPossible[col] = max(int(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
            else:
                print('bye bye col %d: %s' % (col, repr(d)))
                nPossible[col] = -1
                cols.remove(col)
    # values are zero-based codes, so the count is the largest value plus one
    return [int(x) + 1 for x in nPossible]
def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None):
    """ calculates the number of possible values for each variable in a data set

    **Arguments**

      - data: a list of examples

      - order: the ordering map between the variables in _data_ and _qBounds_

      - qBounds: the quantization bounds for the variables

      - nQBounds: optional list; variables with a nonzero entry here are not
        counted (they end up with 0 possible values in the result)

    **Returns**

       a list with the number of possible values each variable takes on in
       the data set

    **Notes**

      - variables present in _qBounds_ will have their _nPossible_ number read
        from _qBounds_

      - _nPossible_ for other numeric variables will be calculated

      - a variable containing a non-integral or non-numeric value gets 0

    """
    numericTypes = [int, float]
    try:
        numericTypes.append(long)  # only defined on Python 2
    except NameError:
        pass

    print('order:', order, len(order))
    print('qB:', qBounds)
    assert (qBounds and len(order) == len(qBounds)) or \
        (nQBounds and len(order) == len(nQBounds)), 'order/qBounds mismatch'
    nVars = len(order)
    nPossible = [-1] * nVars
    # columns whose count still has to be derived from the data; this must be
    # a real list because entries are removed from it below
    cols = list(range(nVars))
    for i in range(nVars):
        if nQBounds and nQBounds[i] != 0:
            nPossible[i] = -1
            cols.remove(i)
        elif qBounds and len(qBounds[i]) > 0:
            nPossible[i] = len(qBounds[i])
            cols.remove(i)

    for pt in data:
        # iterate over a copy: columns are dropped as soon as they disqualify
        for col in cols[:]:
            d = pt[order[col]]
            if type(d) in numericTypes:
                if int(d) == d:
                    nPossible[col] = max(int(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
            else:
                print('bye bye col %d: %s' % (col, repr(d)))
                nPossible[col] = -1
                cols.remove(col)
    # values are zero-based codes, so the count is the largest value plus one
    return [int(x) + 1 for x in nPossible]
def test3Classify(self):
    " testing classification "
    self._setupTree1()
    self._setupTree2()
    # each tree must reproduce the result stored as the last element of
    # every example in its own example set
    for i, example in enumerate(self.examples1):
        assert self.t1.ClassifyExample(example) == example[-1], \
            'examples1[%d] misclassified' % i
    for i, example in enumerate(self.examples2):
        assert self.t2.ClassifyExample(example) == example[-1], \
            'examples2[%d] misclassified' % i
def TypeFinder(data, nRows, nCols, nullMarker=None):
    """ finds the types of the columns in _data_

    if nullMarker is not None, elements of the data table which are
      equal to nullMarker will not count towards setting the type of
      their columns.

    **Arguments**

      - data: a table indexed as data[row][col]

      - nRows, nCols: the number of rows and columns to examine

      - nullMarker: optional value marking missing entries

    **Returns**

      a list (nCols long) of 2-element lists: [type, width].  _type_ is one
      of str, float, int, or -1 for a column with no typed entries; _width_
      is the length of the longest string representation seen (at least 1).

    **Notes**

      - None entries are skipped entirely

      - once a column has been found to contain a string it stays a string
        column, regardless of any numbers that follow

    """
    priorities = {float: 3, int: 2, str: 1, -1: -1}
    res = [None] * nCols
    for col in range(nCols):
        typeHere = [-1, 1]
        for row in range(nRows):
            d = data[row][col]
            if d is None:
                continue
            locType = type(d)
            if locType != float and locType != int:
                # anything that is not exactly a float or an int is text
                locType = str
                try:
                    d = str(d)
                except UnicodeError as msg:  # pragma: nocover
                    print('cannot convert text from row %d col %d to a string' % (row + 2, col))
                    print('\t>%s' % (repr(d)))
                    raise UnicodeError(msg)
            else:
                typeHere[1] = max(typeHere[1], len(str(d)))

            if locType is str:
                if nullMarker is None or d != nullMarker:
                    width = max(len(d), typeHere[1])
                    typeHere = [str, width]
            else:
                try:
                    fD = float(int(d))
                except (OverflowError, ValueError):  # pragma: nocover
                    # infinities and NaNs cannot be represented as ints
                    locType = float
                else:
                    # integral-valued floats are treated as ints
                    if fD == d:
                        locType = int
                # compare against the str type itself: a column already
                # marked as text must not be downgraded by a later number
                if typeHere[0] is not str and \
                        priorities[locType] > priorities[typeHere[0]]:
                    typeHere[0] = locType
        res[col] = typeHere
    return res
def testSingleCalcs(self):
    " testing calculation of a single descriptor "
    for i, cExpr in enumerate(self.cExprs):
        # the expression under test is appended to the shared argument prefix
        argVect = self.piece1 + [cExpr]
        res = Parser.CalcSingleCompoundDescriptor(self.compos, argVect, self.aDict, self.pDict)
        self.assertAlmostEqual(res, self.results[i], 2)
def CrossValidate(tree, testExamples, appendExamples=0):
    """ Determines the classification error for the testExamples

    **Arguments**

      - tree: a decision tree (or anything supporting a _ClassifyExample()_
        method)

      - testExamples: a list of examples to be used for testing

      - appendExamples: a toggle which is passed along to the tree as it does
        the classification. The trees can use this to store the examples they
        classify locally.

    **Returns**

      a 2-tuple consisting of:

        1) the fraction (0-1) of the examples which were misclassified

        2) a list of misclassified examples

    **Notes**

      - the comparison calls _.any()_ on the result of `trueRes != res`, so
        at least one of the two values has to be a numpy object

      - an empty _testExamples_ raises ZeroDivisionError

    """
    nTest = len(testExamples)
    badExamples = []
    for testEx in testExamples:
        # the true result is the last element of each example
        trueRes = testEx[-1]
        res = tree.ClassifyExample(testEx, appendExamples)
        if (trueRes != res).any():
            badExamples.append(testEx)
    return float(len(badExamples)) / nTest, badExamples
def CharacteristicPolynomial(mol, mat=None):
    """ calculates the characteristic polynomial for a molecular graph

    if mat is not passed in, the molecule's Weighted Adjacency Matrix
    is meant to be used; that lookup has never been implemented, so _mat_
    currently has to be supplied.

    The approach used is the Le Verrier-Faddeev-Frame method described
    in _Chemical Graph Theory, 2nd Edition_ by Nenad Trinajstic (CRC Press,
    1992), pg 76.

    **Arguments**

      - mol: the molecule; only its atom count is used here

      - mat: the (nAtoms x nAtoms) matrix to use

    **Returns**

      a numpy float array with nAtoms+1 elements: a leading 1.0 followed by
      the negated coefficients produced by the recursion

    **Raises**

      NotImplementedError if _mat_ is None

    """
    nAtoms = mol.GetNumAtoms()
    if mat is None:
        # FIX: complete this:
        # A = mol.GetWeightedAdjacencyMatrix()
        raise NotImplementedError('a matrix must be provided: retrieving the weighted '
                                  'adjacency matrix from the molecule is not implemented')
    A = mat
    I = 1. * numpy.identity(nAtoms)
    An = A
    # the numpy.float alias is gone from current numpy; builtin float is the same dtype
    res = numpy.zeros(nAtoms + 1, float)
    res[0] = 1.0
    for n in range(1, nAtoms + 1):
        res[n] = 1. / n * numpy.trace(An)
        Bn = An - res[n] * I
        An = numpy.dot(A, Bn)

    res[1:] *= -1
    return res
def PyInfoGain(varMat):
    """ calculates the information gain for a variable

    **Arguments**

      varMat is a numpy array with the number of occurrences of each result
        for each possible value of the given variable.

      So, for a variable which adopts 4 possible values and a result which
        has 3 possible values, varMat would be 4x3

    **Returns**

      The expected information gain (0 if _varMat_ contains no counts)

    """
    variableRes = numpy.sum(varMat, 1)  # indexed by variable, Sv in Mitchell's notation
    overallRes = numpy.sum(varMat, 0)  # indexed by result, S in Mitchell's notation

    # entropy of each variable value's row, weighted by that row's count
    term2 = sum(nWithVal * InfoEntropy(row) for nWithVal, row in zip(variableRes, varMat))
    tSum = sum(overallRes)
    if tSum != 0.0:
        term2 = 1. / tSum * term2
        gain = InfoEntropy(overallRes) - term2
    else:
        gain = 0
    return gain
def ProcessSimpleList(self):
    """ Handles the list of simple descriptors

      This constructs the list of _nonZeroDescriptors_ and _requiredDescriptors_.

      Entries of _simpleList_ whose option list contains 'NONZERO' lose that
      option; if it was their only option the whole entry is removed.  For
      every such entry whose name is not one of the _countOptions_ names a
      '<name> != 0' condition is added to _nonZeroDescriptors_.

      _requiredDescriptors_ ends up holding the names left in _simpleList_,
      minus one occurrence of each _countOptions_ name.

    """
    self.nonZeroDescriptors = []
    # real lists (not lazy map objects): tList is searched repeatedly and
    # requiredDescriptors is edited in place below
    tList = [x[0] for x in countOptions]
    # work from a snapshot because self.simpleList is modified in the loop
    for entry in self.simpleList[:]:
        if 'NONZERO' in entry[1]:
            if entry[0] not in tList:
                self.nonZeroDescriptors.append('%s != 0' % entry[0])
            if len(entry[1]) == 1:
                self.simpleList.remove(entry)
            else:
                self.simpleList[self.simpleList.index(entry)][1].remove('NONZERO')
    self.requiredDescriptors = [x[0] for x in self.simpleList]
    for entry in tList:
        if entry in self.requiredDescriptors:
            self.requiredDescriptors.remove(entry)
def ReadVars(inFile):
    """ reads the variables and quantization bounds from a .qdat or .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qbounds: the list of quantization bounds for each variable
           (a list of floats, empty for variables without bounds)

    """
    varNames = []
    rawBounds = []
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')
    inLine = inFile.readline()
    # the table runs until a line containing the '# ----' separator
    while inLine.find('# ----') == -1:
        # the first two characters of each line are skipped; the name comes
        # before the '[' and the bounds after it
        splitLine = inLine[2:].split('[')
        varNames.append(splitLine[0].strip())
        # drop the last two characters (presumably the closing ']' and the
        # newline -- verify against the file format)
        rawBounds.append(splitLine[1][:-2])
        inLine = inFile.readline()

    qBounds = []
    for boundsTxt in rawBounds:
        if boundsTxt != '':
            qBounds.append([float(item) for item in boundsTxt.split(',')])
        else:
            qBounds.append([])
    return varNames, qBounds
def testTreeGrow(self):
    " testing tree-based composite "
    with open(RDConfig.RDCodeDir + '/ML/Composite/test_data/composite_base.pkl', 'rb') as pklF:
        self.refCompos = cPickle.load(pklF)

    composite = Composite.Composite()
    composite._varNames = self.varNames
    composite.SetQuantBounds(self.qBounds, self.nPoss)
    from rdkit.ML.DecTree import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    pruner = None
    composite.Grow(self.examples, self.attrs, [], buildDriver=driver, pruner=pruner, nTries=100,
                   silent=1)
    composite.AverageErrors()
    composite.SortModels()

    # to regenerate the reference data, pickle `composite` into
    # test_data/composite_base.pkl (opened in 'wb' mode)

    self.treeComposite = composite
    self.assertEqual(len(composite), len(self.refCompos))
    for i in range(len(composite)):
        # entries unpack into three values; only the third one is compared
        _, _, e1 = composite[i]
        _, _, e2 = self.refCompos[i]
        self.assertEqual(e1, e2)
def ChooseOptimalRoot(examples, trainExamples, testExamples, attrs, nPossibleVals, treeBuilder,
                      nQuantBounds=[], **kwargs):
    """ loops through all possible tree roots and chooses the one which produces the best tree

    **Arguments**

      - examples: the full set of examples

      - trainExamples: the training examples

      - testExamples: the testing examples

      - attrs: a list of attributes to consider in the tree building

      - nPossibleVals: a list of the number of possible values each variable can adopt

      - treeBuilder: the function to be used to actually build the tree

      - nQuantBounds: an optional list.  If present, it's assumed that the builder
        algorithm takes this argument as well (for building QuantTrees)

      - any other keyword arguments are passed on to _treeBuilder_ together
        with an _initialVar_ keyword selecting the root

    **Returns**

      The best tree found

    **Notes**

      1) Trees are built using _trainExamples_

      2) Testing of each tree (to determine which is best) is done using
         _CrossValidate_ and the entire set of data (i.e. all of _examples_)

      3) _testExamples_ is not used at all, which immediately raises the
         question of why it's even being passed in

    """
    # work on a copy so the caller's attribute list is left alone
    attrs = attrs[:]
    if nQuantBounds:
        # attributes flagged with -1 are not eligible
        for i, nBounds in enumerate(nQuantBounds):
            if nBounds == -1 and i in attrs:
                attrs.remove(i)
    nAttrs = len(attrs)
    trees = [None] * nAttrs
    errs = [0] * nAttrs
    # NOTE(review): the loop below starts at index 1, so attrs[0] is never
    # tried as a root and its slot keeps this sentinel error -- confirm that
    # skipping it is intended
    errs[0] = 1e6

    for i in range(1, nAttrs):
        argD = {'initialVar': attrs[i]}
        argD.update(kwargs)
        if nQuantBounds is None or nQuantBounds == []:
            trees[i] = treeBuilder(trainExamples, attrs, nPossibleVals, **argD)
        else:
            trees[i] = treeBuilder(trainExamples, attrs, nPossibleVals, nQuantBounds, **argD)
        if trees[i]:
            errs[i], _ = CrossValidate(trees[i], examples, appendExamples=0)
        else:
            # no tree could be built for this root
            errs[i] = 1e6
    best = numpy.argmin(errs)
    # FIX: this used to say 'trees[i]', could that possibly have been right?
    return trees[best]
def CalcCompoundDescriptorsForComposition(self, compos='', composList=None, propDict={}):
    """ calculates all compound descriptors for a given composition

    **Arguments**

      - compos: a string representation of the composition

      - composList: a *composVect*

      - propDict: a dictionary containing the properties of the composition
        as a whole (e.g. structural variables, etc.)

      The client must provide either _compos_ or _composList_.  If both are
      provided, _composList_ takes priority.

    **Returns**
      the list of descriptor values, one per entry of _self.compoundList_

    **Notes**

      - when _compos_ is provided, this uses _chemutils.SplitComposition_
        to split the composition into its individual pieces

      - the shared default _propDict_ is only passed along by this method,
        never modified here

    """
    if composList is None:
        composList = chemutils.SplitComposition(compos)
    # the first element of each compoundList entry is skipped; the rest is
    # handed to the parser as the descriptor's argument vector
    return [
        Parser.CalcSingleCompoundDescriptor(composList, entry[1:], self.atomDict, propDict)
        for entry in self.compoundList
    ]
def ProcessSimpleList(self):
    """ Handles the list of simple descriptors

      This constructs the list of _nonZeroDescriptors_ and _requiredDescriptors_.

      Entries of _simpleList_ whose option list contains 'NONZERO' lose that
      option; if it was their only option the whole entry is removed.  For
      every such entry whose name is not one of the _countOptions_ names a
      '<name> != 0' condition is added to _nonZeroDescriptors_.

      _requiredDescriptors_ ends up holding the names left in _simpleList_,
      minus one occurrence of each _countOptions_ name.

    """
    self.nonZeroDescriptors = []
    # real lists (not lazy map objects): tList is searched repeatedly and
    # requiredDescriptors is edited in place below
    tList = [x[0] for x in countOptions]
    # work from a snapshot because self.simpleList is modified in the loop
    for entry in self.simpleList[:]:
        if 'NONZERO' in entry[1]:
            if entry[0] not in tList:
                self.nonZeroDescriptors.append('%s != 0' % entry[0])
            if len(entry[1]) == 1:
                self.simpleList.remove(entry)
            else:
                self.simpleList[self.simpleList.index(entry)][1].remove('NONZERO')
    self.requiredDescriptors = [x[0] for x in self.simpleList]
    for entry in tList:
        if entry in self.requiredDescriptors:
            self.requiredDescriptors.remove(entry)
def testTreeGrow(self):
    " testing tree-based composite "
    # the reference pickle is read as text so that '\r\n' line endings can be
    # normalised before it is unpickled
    with open(RDConfig.RDCodeDir + "/ML/Composite/test_data/composite_base.pkl", "r") as pklTF:
        buf = pklTF.read().replace("\r\n", "\n").encode("utf-8")
    with io.BytesIO(buf) as pklF:
        self.refCompos = cPickle.load(pklF)

    composite = Composite.Composite()
    composite._varNames = self.varNames
    composite.SetQuantBounds(self.qBounds, self.nPoss)
    from rdkit.ML.DecTree import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    pruner = None
    composite.Grow(self.examples, self.attrs, [], buildDriver=driver, pruner=pruner, nTries=100,
                   silent=1)
    composite.AverageErrors()
    composite.SortModels()

    # to regenerate the reference data, pickle `composite` into
    # test_data/composite_base.pkl (opened in 'wb' mode)

    self.treeComposite = composite
    self.assertEqual(len(composite), len(self.refCompos))
    for i in range(len(composite)):
        # entries unpack into three values; only the third one is compared
        _, _, e1 = composite[i]
        _, _, e2 = self.refCompos[i]
        self.assertEqual(e1, e2)
def PyInfoGain(varMat):
    """ calculates the information gain for a variable

    **Arguments**

      varMat is a numpy array with the number of occurrences of each result
        for each possible value of the given variable.

      So, for a variable which adopts 4 possible values and a result which
        has 3 possible values, varMat would be 4x3

    **Returns**

      The expected information gain (0 if _varMat_ contains no counts)

    """
    variableRes = numpy.sum(varMat, 1)  # indexed by variable, Sv in Mitchell's notation
    overallRes = numpy.sum(varMat, 0)  # indexed by result, S in Mitchell's notation

    # entropy of each variable value's row, weighted by that row's count
    term2 = sum(nWithVal * InfoEntropy(row) for nWithVal, row in zip(variableRes, varMat))
    tSum = sum(overallRes)
    if tSum != 0.0:
        term2 = 1. / tSum * term2
        gain = InfoEntropy(overallRes) - term2
    else:
        gain = 0
    return gain
def testSingleCalcs(self):
    " testing calculation of a single descriptor "
    for i, cExpr in enumerate(self.cExprs):
        # the expression under test is appended to the shared argument prefix
        argVect = self.piece1 + [cExpr]
        res = Parser.CalcSingleCompoundDescriptor(self.compos, argVect, self.aDict, self.pDict)
        self.assertAlmostEqual(res, self.results[i], 2)
def _CalcNPossible(self, data):
    """calculates the number of possible values of each variable (where possible)

    **Arguments**

       -data: a list of examples to be used

    **Returns**

       a list of nPossible values for each variable (results included);
       variables with quantization bounds report the number of bounds,
       variables holding a non-integral or non-numeric value report 0.

    """
    nVars = self.GetNVars() + self.nResults
    nPossible = [-1] * nVars
    # columns whose count still has to be derived from the data
    cols = list(range(nVars))
    for i, bounds in enumerate(self.qBounds):
        if len(bounds) > 0:
            nPossible[i] = len(bounds)
            cols.remove(i)

    # NOTE(review): the _data_ argument is ignored -- the points scanned are
    # always self.data.  Confirm callers never pass anything else.
    for pt in self.data:
        # iterate over a copy: columns are dropped as soon as they disqualify
        for col in cols[:]:
            d = pt[col]
            if type(d) in numericTypes:
                if math.floor(d) == d:
                    nPossible[col] = max(math.floor(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
            else:
                nPossible[col] = -1
                cols.remove(col)
    # values are zero-based codes, so the count is the largest value plus one
    return [int(x) + 1 for x in nPossible]
def ReadVars(inFile):
    """ reads the variables and quantization bounds from a .qdat or .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qbounds: the list of quantization bounds for each variable
           (a list of floats, empty for variables without bounds)

    """
    varNames = []
    rawBounds = []
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')
    inLine = inFile.readline()
    # the table runs until a line containing the '# ----' separator
    while inLine.find('# ----') == -1:
        # the first two characters of each line are skipped; the name comes
        # before the '[' and the bounds after it
        splitLine = inLine[2:].split('[')
        varNames.append(splitLine[0].strip())
        # drop the last two characters (presumably the closing ']' and the
        # newline -- verify against the file format)
        rawBounds.append(splitLine[1][:-2])
        inLine = inFile.readline()

    qBounds = []
    for boundsTxt in rawBounds:
        if boundsTxt != '':
            qBounds.append([float(item) for item in boundsTxt.split(',')])
        else:
            qBounds.append([])
    return varNames, qBounds
def CharacteristicPolynomial(mol, mat=None):
    """ calculates the characteristic polynomial for a molecular graph

    if mat is not passed in, the molecule's Weighted Adjacency Matrix
    is meant to be used; that lookup has never been implemented, so _mat_
    currently has to be supplied.

    The approach used is the Le Verrier-Faddeev-Frame method described
    in _Chemical Graph Theory, 2nd Edition_ by Nenad Trinajstic (CRC Press,
    1992), pg 76.

    **Arguments**

      - mol: the molecule; only its atom count is used here

      - mat: the (nAtoms x nAtoms) matrix to use

    **Returns**

      a numpy float array with nAtoms+1 elements: a leading 1.0 followed by
      the negated coefficients produced by the recursion

    **Raises**

      NotImplementedError if _mat_ is None

    """
    nAtoms = mol.GetNumAtoms()
    if mat is None:
        # FIX: complete this:
        # A = mol.GetWeightedAdjacencyMatrix()
        raise NotImplementedError('a matrix must be provided: retrieving the weighted '
                                  'adjacency matrix from the molecule is not implemented')
    A = mat
    I = 1. * numpy.identity(nAtoms)
    An = A
    # the numpy.float alias is gone from current numpy; builtin float is the same dtype
    res = numpy.zeros(nAtoms + 1, float)
    res[0] = 1.0
    for n in range(1, nAtoms + 1):
        res[n] = 1. / n * numpy.trace(An)
        Bn = An - res[n] * I
        An = numpy.dot(A, Bn)

    res[1:] *= -1
    return res
def TypeFinder(data, nRows, nCols, nullMarker=None):
    """ finds the types of the columns in _data_

    if nullMarker is not None, elements of the data table which are
      equal to nullMarker will not count towards setting the type of
      their columns.

    **Arguments**

      - data: a table indexed as data[row][col]

      - nRows, nCols: the number of rows and columns to examine

      - nullMarker: optional value marking missing entries

    **Returns**

      a list (nCols long) of 2-element lists: [type, width].  _type_ is one
      of str, float, int, or -1 for a column with no typed entries; _width_
      is the length of the longest string representation seen (at least 1).

    **Notes**

      - None entries are skipped entirely

      - once a column has been found to contain a string it stays a string
        column, regardless of any numbers that follow

    """
    priorities = {float: 3, int: 2, str: 1, -1: -1}
    res = [None] * nCols
    for col in range(nCols):
        typeHere = [-1, 1]
        for row in range(nRows):
            d = data[row][col]
            if d is None:
                continue
            locType = type(d)
            if locType != float and locType != int:
                # anything that is not exactly a float or an int is text
                locType = str
                try:
                    d = str(d)
                except UnicodeError as msg:  # pragma: nocover
                    print('cannot convert text from row %d col %d to a string' % (row + 2, col))
                    print('\t>%s' % (repr(d)))
                    raise UnicodeError(msg)
            else:
                typeHere[1] = max(typeHere[1], len(str(d)))

            if locType is str:
                if nullMarker is None or d != nullMarker:
                    width = max(len(d), typeHere[1])
                    typeHere = [str, width]
            else:
                try:
                    fD = float(int(d))
                except (OverflowError, ValueError):  # pragma: nocover
                    # infinities and NaNs cannot be represented as ints
                    locType = float
                else:
                    # integral-valued floats are treated as ints
                    if fD == d:
                        locType = int
                # compare against the str type itself: a column already
                # marked as text must not be downgraded by a later number
                if typeHere[0] is not str and \
                        priorities[locType] > priorities[typeHere[0]]:
                    typeHere[0] = locType
        res[col] = typeHere
    return res
def GetAtomicData(atomDict, descriptorsDesired, dBase=_atomDbName, table='atomic_data', where='',
                  user='******', password='******', includeElCounts=0):
    """ pulls atomic data from a database

    **Arguments**

      - atomDict: the dictionary to populate (keyed by atom name; each value
        is a dictionary mapping upper-cased descriptor names to values)

      - descriptorsDesired: the descriptors to pull for each atom

      - dBase: the DB to use

      - table: the DB table to use

      - where: the SQL where clause

      - user: the user name to use with the DB

      - password: the password to use with the DB

      - includeElCounts: if nonzero, valence electron count fields are added to
         the _atomDict_

    **Notes**

      - the query is assembled by string formatting, so _table_, _where_ and
        the descriptor names must come from trusted code

      - if the query fails a message is printed and _atomDict_ is left
        unchanged

    """
    # these are computed from the electron configuration, not stored columns
    extraFields = ['NVAL', 'NVAL_NO_FULL_F', 'NVAL_NO_FULL_D', 'NVAL_NO_FULL']
    from rdkit.Dbase import DbModule
    cn = DbModule.connect(dBase, user, password)
    c = cn.cursor()
    # builds a new list, so the caller's list is never modified
    descriptorsDesired = [s.upper() for s in descriptorsDesired]
    if 'NAME' not in descriptorsDesired:
        descriptorsDesired.append('NAME')
    if includeElCounts and 'CONFIG' not in descriptorsDesired:
        descriptorsDesired.append('CONFIG')
    for field in extraFields:
        if field in descriptorsDesired:
            descriptorsDesired.remove(field)
    toPull = ','.join(descriptorsDesired)
    # honour the table argument (its default matches the old hard-coded name)
    command = 'select %s from %s %s' % (toPull, table, where)
    try:
        c.execute(command)
    except Exception:
        print('Problems executing command:', command)
        return
    res = c.fetchall()
    for atom in res:
        # columns come back in the order they were requested
        tDict = dict(zip(descriptorsDesired, atom))
        name = tDict['NAME']
        atomDict[name] = tDict
        if includeElCounts:
            config = tDict['CONFIG']
            tDict['NVAL'] = ConfigToNumElectrons(config)
            tDict['NVAL_NO_FULL_F'] = ConfigToNumElectrons(config, ignoreFullF=1)
            tDict['NVAL_NO_FULL_D'] = ConfigToNumElectrons(config, ignoreFullD=1)
            tDict['NVAL_NO_FULL'] = ConfigToNumElectrons(config, ignoreFullF=1, ignoreFullD=1)
def test4UnusedVars(self):
    " testing unused variables "
    self._setupTree1a()
    # compare the freshly built tree against the pickled reference
    with open(self.qTree1Name, 'rb') as inFile:
        t2 = cPickle.load(inFile)
    assert self.t1 == t2, 'Incorrect tree generated.'
    for i, example in enumerate(self.examples1):
        assert self.t1.ClassifyExample(example) == example[-1], \
            'examples1[%d] misclassified' % i
def test4UnusedVars(self):
    " testing unused variables "
    self._setupTree1a()
    # compare the freshly built tree against the pickled reference
    with open(self.qTree1Name, 'rb') as inFile:
        t2 = cPickle.load(inFile)
    assert self.t1 == t2, 'Incorrect tree generated.'
    for i, example in enumerate(self.examples1):
        assert self.t1.ClassifyExample(example) == example[-1], \
            'examples1[%d] misclassified' % i
def testMultipleCalcs(self):
    " testing calculation of multiple descriptors "
    for i, cExpr in enumerate(self.cExprs):
        argVect = self.piece1 + [cExpr]
        # the same composition is submitted twice, so both answers must match
        # the single reference value
        res = Parser.CalcMultipleCompoundsDescriptor([self.compos, self.compos], argVect,
                                                     self.aDict, [self.pDict, self.pDict])
        self.assertAlmostEqual(res[0], self.results[i], 2)
        self.assertAlmostEqual(res[1], self.results[i], 2)
def testMultipleCalcs(self):
    " testing calculation of multiple descriptors "
    for i, cExpr in enumerate(self.cExprs):
        argVect = self.piece1 + [cExpr]
        # the same composition is submitted twice, so both answers must match
        # the single reference value
        res = Parser.CalcMultipleCompoundsDescriptor([self.compos, self.compos], argVect,
                                                     self.aDict, [self.pDict, self.pDict])
        self.assertAlmostEqual(res[0], self.results[i], 2)
        self.assertAlmostEqual(res[1], self.results[i], 2)
def __str__(self):
    """ provides a string representation of the network

    One block per node (its index, inputs and weights) followed by the total
    number of connections.

    """
    # collect the pieces and join once instead of growing a string in a loop
    pieces = ['Network:\n']
    for i, node in enumerate(self.nodeList):
        pieces.append('\tnode(% 3d):\n' % i)
        pieces.append('\t\tinputs: %s\n' % (str(node.GetInputs())))
        pieces.append('\t\tweights: %s\n' % (str(node.GetWeights())))
    pieces.append('Total Number of Connections: % 4d' % self.nConnections)
    return ''.join(pieces)
def testQuantize(self):
    " testing data quantization "
    # the first variable has no bounds; the second one has three
    qBounds = [[], [1, 2, 3]]
    examples = [["foo", 0], ["foo", 1.5], ["foo", 5.5], ["foo", 2.5]]
    answers = [["foo", 0], ["foo", 1], ["foo", 3], ["foo", 2]]
    nPoss = [0, 4]
    composite = Composite.Composite()
    composite.SetQuantBounds(qBounds, nPoss)
    for example, answer in zip(examples, answers):
        qEx = composite.QuantizeExample(example)
        self.assertEqual(qEx, answer)
def testQuantize(self):
    " testing data quantization "
    # the first variable has no bounds; the second one has three
    qBounds = [[], [1, 2, 3]]
    examples = [['foo', 0], ['foo', 1.5], ['foo', 5.5], ['foo', 2.5]]
    answers = [['foo', 0], ['foo', 1], ['foo', 3], ['foo', 2]]
    nPoss = [0, 4]
    composite = Composite.Composite()
    composite.SetQuantBounds(qBounds, nPoss)
    for example, answer in zip(examples, answers):
        qEx = composite.QuantizeExample(example)
        self.assertEqual(qEx, answer)
def _AddDataToDb(dBase, table, user, password, colDefs, colTypes, data, nullMarker=None,
                 blockSize=100, cn=None):
    """ *For Internal Use*

      (drops and) creates a table and then inserts the values

    **Arguments**

      - dBase, user, password: used to open a connection when _cn_ is not
        provided

      - table: the name of the table to (re)create

      - colDefs: the column definitions used in the 'create table' statement

      - colTypes: one entry per column; the first element of each entry is
        the type (float, int or anything else for text) values are converted to

      - data: the rows to insert

      - nullMarker: values equal to this (and None) are inserted as NULL

      - blockSize: the number of rows handed to each insert call

      - cn: an optional, already open, connection

    **Notes**

      - if the table cannot be created the error is printed and nothing is
        inserted

      - SQL is assembled by string formatting, so _table_ and _colDefs_ must
        come from trusted code

    """
    if not cn:
        cn = DbModule.connect(dBase, user, password)
    c = cn.cursor()
    try:
        c.execute('drop table %s' % (table))
    except Exception:
        # best effort: presumably the table just does not exist yet
        print('cannot drop table %s' % (table))
    sqlStr = 'create table %s (%s)' % (table, colDefs)
    try:
        c.execute(sqlStr)
    except Exception:
        print('create table failed: ', sqlStr)
        print('here is the exception:')
        import traceback
        traceback.print_exc()
        return
    cn.commit()
    c = None

    if len(data) == 0:
        # nothing to insert; the (empty) table has already been created
        return

    # one placeholder per column of the first row
    dStr = ','.join([DbModule.placeHolder] * len(data[0]))
    sqlStr = 'insert into %s values (%s)' % (table, dStr)
    block = []
    for row in data:
        entries = [None] * len(row)
        for col, val in enumerate(row):
            if val is not None and (nullMarker is None or val != nullMarker):
                colType = colTypes[col][0]
                if colType == float:
                    entries[col] = float(val)
                elif colType == int:
                    entries[col] = int(val)
                else:
                    entries[col] = str(val)
        block.append(tuple(entries))
        if len(block) >= blockSize:
            _insertBlock(cn, sqlStr, block)
            # commit explicitly unless the connection does so on its own
            if not getattr(cn, 'autocommit', False):
                cn.commit()
            block = []
    if block:
        # flush the final, partially filled, block
        _insertBlock(cn, sqlStr, block)
        if not getattr(cn, 'autocommit', False):
            cn.commit()
def _AddDataToDb(dBase, table, user, password, colDefs, colTypes, data, nullMarker=None,
                 blockSize=100, cn=None):
    """ *For Internal Use*

      (drops and) creates a table and then inserts the values

    **Arguments**

      - dBase, user, password: used to open a connection when _cn_ is not
        provided

      - table: the name of the table to (re)create

      - colDefs: the column definitions used in the 'create table' statement

      - colTypes: one entry per column; the first element of each entry is
        the type (float, int or anything else for text) values are converted to

      - data: the rows to insert

      - nullMarker: values equal to this (and None) are inserted as NULL

      - blockSize: the number of rows handed to each insert call

      - cn: an optional, already open, connection

    **Notes**

      - if the table cannot be created the error is printed and nothing is
        inserted

      - SQL is assembled by string formatting, so _table_ and _colDefs_ must
        come from trusted code

    """
    if not cn:
        cn = DbModule.connect(dBase, user, password)
    c = cn.cursor()
    # catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not swallowed
    try:
        c.execute('drop table %s' % (table))
    except Exception:
        # best effort: presumably the table just does not exist yet
        print('cannot drop table %s' % (table))
    sqlStr = 'create table %s (%s)' % (table, colDefs)
    try:
        c.execute(sqlStr)
    except Exception:
        print('create table failed: ', sqlStr)
        print('here is the exception:')
        import traceback
        traceback.print_exc()
        return
    cn.commit()
    c = None

    if len(data) == 0:
        # nothing to insert; the (empty) table has already been created
        return

    # one placeholder per column of the first row
    dStr = ','.join([DbModule.placeHolder] * len(data[0]))
    sqlStr = 'insert into %s values (%s)' % (table, dStr)
    block = []
    for row in data:
        entries = [None] * len(row)
        for col, val in enumerate(row):
            if val is not None and (nullMarker is None or val != nullMarker):
                colType = colTypes[col][0]
                if colType == float:
                    entries[col] = float(val)
                elif colType == int:
                    entries[col] = int(val)
                else:
                    entries[col] = str(val)
        block.append(tuple(entries))
        if len(block) >= blockSize:
            _insertBlock(cn, sqlStr, block)
            # commit explicitly unless the connection does so on its own
            if not getattr(cn, 'autocommit', False):
                cn.commit()
            block = []
    if block:
        # flush the final, partially filled, block
        _insertBlock(cn, sqlStr, block)
        if not getattr(cn, 'autocommit', False):
            cn.commit()
def testSimpleDescriptorCalc(self):
    " testing simple descriptor calculation "
    composList = ['Nb', 'Nb3', 'NbPt', 'Nb2Pt']
    # reference descriptor values, one row per composition above
    compare = [[2.32224798203, 0.0, 1.34000003338, 1.34000003338],
               [2.32224798203, 0.0, 1.34000003338, 1.34000003338],
               [1.51555249095, 0.806695491076, 1.34000003338, 1.29999995232],
               [1.78445098797, 0.717062658734, 1.34000003338, 1.29999995232]]
    for compos, expected in zip(composList, compare):
        calculated = self.desc.CalcSimpleDescriptorsForComposition(compos)
        # the largest element-wise deviation has to stay below the tolerance
        maxDiff = max(abs(x - y) for x, y in zip(expected, calculated))
        assert maxDiff < self.tol, 'Descriptor calculation failed'
def test4UnusedVars(self):
    " testing unused variables "
    self._setupTree1a()
    # the reference pickle is read as text so that '\r\n' line endings can be
    # normalised before it is unpickled
    with open(self.qTree1Name, 'r') as inTFile:
        buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as inFile:
        t2 = cPickle.load(inFile)
    assert self.t1 == t2, 'Incorrect tree generated.'
    for i, example in enumerate(self.examples1):
        assert self.t1.ClassifyExample(example) == example[-1], \
            'examples1[%d] misclassified' % i
def GetNamedData(self):
    """ returns a list of named examples

     **Note**

       a named example is the result of prepending the example
        name to the data list

    """
    # each data row is converted with tolist() before the name is prepended
    return [[self.ptNames[i]] + self.data[i].tolist() for i in range(self.nPts)]
def ID3Boot(examples, attrs, nPossibleVals, initialVar=None, depth=0, maxDepth=-1, **kwargs):
    """ Bootstrapping code for the ID3 algorithm

      see ID3 for descriptions of the arguments

      If _initialVar_ is not set, the algorithm will automatically
       choose the first variable in the tree (the standard greedy
       approach).  Otherwise, _initialVar_ will be used as the first
       split.

      The only keyword inspected here is _recycleVars_: unless it is set,
       the variable used for the root split is not offered to the subtrees.
       All keywords are passed on to _ID3_.

    """
    totEntropy = CalcTotalEntropy(examples, nPossibleVals)
    varTable = GenVarTable(examples, nPossibleVals, attrs)

    tree = DecTree.DecTreeNode(None, 'node')
    tree._nResultCodes = nPossibleVals[-1]

    if initialVar is None:
        # greedy choice: the attribute with the largest information gain
        best = attrs[numpy.argmax([entropy.InfoGain(x) for x in varTable])]
    else:
        best = initialVar

    tree.SetName('Var: %d' % best)
    tree.SetData(totEntropy)
    tree.SetLabel(best)
    tree.SetTerminal(0)

    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    # one child per possible value of the chosen variable, built from the
    # examples which take that value
    for val in range(nPossibleVals[best]):
        nextExamples = [example for example in examples if example[best] == val]
        tree.AddChildNode(
            ID3(nextExamples, best, nextAttrs, nPossibleVals, depth, maxDepth, **kwargs))
    return tree
def GetTypeStrings(colHeadings, colTypes, keyCol=None):
    """ returns a list of SQL type strings

    **Arguments**

      - colHeadings: the column names, indexed in parallel with _colTypes_

      - colTypes: a sequence of sequences; element 0 of each entry is the
        column's type (float, int or anything else) and, for anything else,
        element 1 is the width used for the varchar

      - keyCol: (optional) name of the column to be marked as primary key

    **Returns**

      one '<heading> <sql type>' string per entry of _colTypes_

    """
    typeStrs = []
    for idx, colType in enumerate(colTypes):
        kind = colType[0]
        heading = colHeadings[idx]
        if kind == float:
            decl = '%s double precision' % heading
        elif kind == int:
            decl = '%s integer' % heading
        else:
            decl = '%s varchar(%d)' % (heading, colType[1])
        if heading == keyCol:
            decl = '%s not null primary key' % (decl)
        typeStrs.append(decl)
    return typeStrs
def GetTypeStrings(colHeadings, colTypes, keyCol=None):
    """ returns a list of SQL type strings

    Each entry of _colTypes_ is a sequence whose first element is the
    column's type: float maps to 'double precision', int to 'integer' and
    everything else to 'varchar(n)' where n is the entry's second element.
    The column whose heading equals _keyCol_ additionally gets
    'not null primary key' appended.

    """
    typeStrs = []
    for position, typ in enumerate(colTypes):
        baseType = typ[0]
        name = colHeadings[position]
        if baseType == float:
            typeStrs.append('%s double precision' % name)
        elif baseType == int:
            typeStrs.append('%s integer' % name)
        else:
            typeStrs.append('%s varchar(%d)' % (name, typ[1]))
        if name == keyCol:
            typeStrs[-1] = '%s not null primary key' % (typeStrs[-1])
    return typeStrs
def GenRandomExamples(nVars=10, randScale=0.3, bitProb=0.5, nExamples=500, seed=(0, 0),
                      addResults=1):
    """ generates a reproducible set of random boolean examples

    **Arguments**

      - nVars: number of boolean variables per example

      - randScale: scale factor applied to the random per-variable weights

      - bitProb: a variable is True when its random draw exceeds this value

      - nExamples: number of examples to generate

      - seed: sequence whose first element seeds the random number
        generator (only element 0 is used here)

      - addResults: if true, a result value is appended to each example; the
        result is True when the weighted sum of the variables is >= 1

    **Returns**

      a 3-tuple containing:

        1) the list of examples

        2) the list of attribute indices (0 .. nVars-1)

        3) nPossibleVals: the number of possible values (2) for each
           variable plus one entry for the result

    """
    random.seed(seed[0])
    varWeights = numpy.array([random.random() for _ in range(nVars)]) * randScale

    examples = []
    for _ in range(nExamples):
        varVals = [random.random() > bitProb for _ in range(nVars)]
        if addResults:
            # builtin sum() is kept on purpose so the accumulated value (and
            # therefore the threshold decision) is computed exactly as before
            res = sum(numpy.array(varVals) * varWeights)
            varVals.append(res >= 1.)
        examples.append(varVals)

    # one entry per variable + one for the result; sizing this by the number
    # of examples made it too short whenever nExamples < nVars
    nPossibleVals = [2] * (nVars + 1)
    attrs = list(range(nVars))
    return (examples, attrs, nPossibleVals)
def ReadGeneralExamples(inFile):
    """ reads the examples from a .dat file

    **Arguments**

      - inFile: a file object (only its readline() method is used)

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Notes**

      - lines starting with '#' are skipped

      - fields are separated by runs of spaces and/or tabs; the first field
        is the example name

      - the final piece of every split line is discarded, so a data line is
        expected to have a separator before its line ending; a last value
        that runs straight into the newline is dropped with it

      - this attempts to convert variable values to ints, then floats.
        if those both fail, they are left as strings

    """
    commentExpr = re.compile(r'^#')
    # The separator must consume at least one character: a pattern that can
    # match the empty string makes re.split() (Python 3.7+) break the line
    # apart between every character.
    splitExpr = re.compile(r'[ \t]+')
    examples = []
    names = []
    inLine = inFile.readline()
    while inLine:
        if commentExpr.search(inLine) is None:
            resArr = splitExpr.split(inLine)[:-1]
            # a name without any values is ignored
            if len(resArr) > 1:
                values = []
                for token in resArr[1:]:
                    try:
                        token = int(token)
                    except ValueError:
                        try:
                            token = float(token)
                        except ValueError:
                            pass
                    values.append(token)
                examples.append(values)
                names.append(resArr[0])
        inLine = inFile.readline()
    return names, examples
def ID3Boot(examples, attrs, nPossibleVals, initialVar=None, depth=0, maxDepth=-1, **kwargs):
    """ Bootstrapping code for the ID3 algorithm

    see ID3 for descriptions of the arguments

    If _initialVar_ is not set, the first split is chosen automatically as
    the attribute with the highest information gain (the standard greedy
    approach).  Otherwise, _initialVar_ will be used as the first split.

    The split variable is dropped from the attributes passed on to the
    subtrees unless the _recycleVars_ keyword argument is true.

    """
    totEntropy = CalcTotalEntropy(examples, nPossibleVals)
    varTable = GenVarTable(examples, nPossibleVals, attrs)
    tree = DecTree.DecTreeNode(None, 'node')
    tree._nResultCodes = nPossibleVals[-1]

    best = initialVar
    if best is None:
        infoGains = [entropy.InfoGain(tbl) for tbl in varTable]
        best = attrs[numpy.argmax(infoGains)]

    tree.SetName('Var: %d' % best)
    tree.SetData(totEntropy)
    tree.SetLabel(best)
    tree.SetTerminal(0)

    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    for val in range(nPossibleVals[best]):
        # examples whose split variable takes this value feed this branch
        matching = [ex for ex in examples if ex[best] == val]
        tree.AddChildNode(
            ID3(matching, best, nextAttrs, nPossibleVals, depth, maxDepth, **kwargs))
    return tree
def ReadGeneralExamples(inFile):
    """ reads the examples from a .dat file

    **Arguments**

      - inFile: a file object (only its readline() method is used)

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Notes**

      - lines starting with '#' are skipped

      - fields are separated by runs of spaces and/or tabs; the first field
        is the example name

      - the final piece of every split line is discarded, so a data line is
        expected to have a separator before its line ending; a last value
        that runs straight into the newline is dropped with it

      - this attempts to convert variable values to ints, then floats.
        if those both fail, they are left as strings

    """
    expr1 = re.compile(r'^#')
    # Require at least one separator character: a pattern that can match the
    # empty string makes re.split() (Python 3.7+) split between every
    # character of the line.
    expr2 = re.compile(r'[ \t]+')
    examples = []
    names = []
    inLine = inFile.readline()
    while inLine:
        if expr1.search(inLine) is None:
            resArr = expr2.split(inLine)[:-1]
            # a name without any values is ignored
            if len(resArr) > 1:
                for i, d in enumerate(resArr):
                    if i == 0:
                        # the name stays a string
                        continue
                    try:
                        resArr[i] = int(d)
                    except ValueError:
                        try:
                            resArr[i] = float(d)
                        except ValueError:
                            pass
                examples.append(resArr[1:])
                names.append(resArr[0])
        inLine = inFile.readline()
    return names, examples
def FullyConnectNodes(self):
    """ Fully connects each layer in the network to the one above it

    Every node of a hidden layer, and then every node of the output layer
    (the last entry of ``self.layerIndices``), is given the indices of the
    nodes in the preceding layer as its inputs.  The total number of
    connections made is stored in ``self.nConnections``.

    **Note**
      this sets the connections, but does not assign weights

    """
    inputIndices = range(self.numInputNodes)
    total = 0
    # hidden layers are entries 1 .. numHiddenLayers of layerIndices
    for hiddenIdx in range(1, self.numHiddenLayers + 1):
        members = self.layerIndices[hiddenIdx]
        for nodeIdx in members:
            self.nodeList[nodeIdx].SetInputs(inputIndices)
            total += len(inputIndices)
        inputIndices = members

    # finally wire the output layer to the last layer processed
    for nodeIdx in self.layerIndices[-1]:
        self.nodeList[nodeIdx].SetInputs(inputIndices)
        total += len(inputIndices)

    self.nConnections = total
def FullyConnectNodes(self):
    """ Fully connects each layer in the network to the one above it

    Walks the hidden layers in order and then the output layer, handing
    each node the list of node indices of the previous layer via
    ``SetInputs``.  ``self.nConnections`` receives the number of connections
    created.

    **Note**
      this sets the connections, but does not assign weights

    """
    def connect(layerNodes, sources):
        # attach `sources` to every node of the layer; return connections made
        made = 0
        for idx in layerNodes:
            self.nodeList[idx].SetInputs(sources)
            made = made + len(sources)
        return made

    sources = range(self.numInputNodes)
    nConnections = 0
    for layer in range(self.numHiddenLayers):
        current = self.layerIndices[layer + 1]
        nConnections = nConnections + connect(current, sources)
        sources = current
    nConnections = nConnections + connect(self.layerIndices[-1], sources)
    self.nConnections = nConnections
def CalcSimpleDescriptorsForComposition(self, compos='', composList=None):
    """ calculates all simple descriptors for a given composition

    **Arguments**

      - compos: a string representation of the composition

      - composList: a *composVect*

      The client must provide either _compos_ or _composList_.  If both are
      provided, _composList_ takes priority.

    **Returns**

      the list of descriptor values: one value for every (descriptor name,
      target method) pair in ``self.simpleList`` whose target method exists
      on this object

    **Notes**

      - when _compos_ is provided, this uses _chemutils.SplitComposition_
        to split the composition into its individual pieces

      - a target naming a method this object does not have is reported on
        stdout and skipped

      - if a _KeyError_ is raised while calculating (e.g. because of an
        unknown descriptor or atom type), the composition is reported on
        stdout and the original _KeyError_ is re-raised.

    """
    if composList is None:
        composList = chemutils.SplitComposition(compos)
    res = []
    try:
        for descName, targets in self.simpleList:
            for target in targets:
                try:
                    method = getattr(self, target)
                except AttributeError:
                    print('Method %s does not exist' % (target))
                else:
                    res.append(method(descName, composList))
    except KeyError:
        # 1-tuple so that a tuple-valued composList still formats correctly
        print('composition %s caused problems' % (composList,))
        # bare raise keeps the original key and traceback instead of
        # wrapping the exception in a second KeyError
        raise
    return res
def _AdjustColHeadings(colHeadings, maxColLabelLen):
    """ *For Internal Use*

    Cleans up column headings in place: surrounding whitespace is stripped
    and spaces, dashes and periods are replaced with underscores.  A heading
    that is still longer than _maxColLabelLen_ has its underscores removed
    and is then truncated to _maxColLabelLen_ characters (a message is
    printed when this happens).

    **Returns**

      the same list object that was passed in, with its entries replaced

    """
    for idx, heading in enumerate(colHeadings):
        cleaned = heading.strip()
        for unallowed in (' ', '-', '.'):
            cleaned = cleaned.replace(unallowed, '_')
        if len(cleaned) > maxColLabelLen:
            # interbase (at least) has a limit on the maximum length of a column name
            shortened = cleaned.replace('_', '')[:maxColLabelLen]
            print('\tHeading %s too long, changed to %s' % (cleaned, shortened))
            cleaned = shortened
        colHeadings[idx] = cleaned
    return colHeadings
def GetDescriptorNames(self):
    """ returns the names of the descriptors this calculator generates

    Simple descriptors are named '<target>_<descName>' for every target in
    ``self.simpleList`` that exists as an attribute of this object (missing
    ones are reported on stdout and skipped); these are followed by the
    first element of every entry in ``self.compoundList``.

    **Note**

      the names are cached in ``self.descriptorNames``.  When the cache is
      already set it is returned as-is; when the names are computed here, a
      list copy is cached and a tuple is returned.

    """
    if self.descriptorNames is not None:
        return self.descriptorNames

    names = []
    for descName, targets in self.simpleList:
        for target in targets:
            try:
                # only the existence of the method matters here
                getattr(self, target)
            except AttributeError:
                print('Method %s does not exist' % (target))
            else:
                names.append('%s_%s' % (target, descName))
    names.extend(entry[0] for entry in self.compoundList)
    self.descriptorNames = list(names)
    return tuple(names)