def testQuantPickle(self):
  """ testing QuantDataSet pickling """
  self.setUpQuantLoad()
  DataUtils.WritePickledData(RDConfig.RDCodeDir + '/ML/Data/test_data/testquant.qdat.pkl', self.d)
  with open(RDConfig.RDCodeDir + '/ML/Data/test_data/testquant.qdat.pkl', 'rb') as f:
    vNames = pickle.load(f)
    qBounds = pickle.load(f)
    ptNames = pickle.load(f)
    examples = pickle.load(f)
  d = MLData.MLQuantDataSet(examples, varNames=vNames, qBounds=qBounds, ptNames=ptNames)
  assert self.d.GetNPts() == d.GetNPts(), 'nPts wrong'
  assert self.d.GetNVars() == d.GetNVars(), 'nVars wrong'
  assert self.d.GetNResults() == d.GetNResults(), 'nResults wrong'
  assert self.d.GetVarNames() == d.GetVarNames(), 'varNames wrong'
  assert self.d.GetPtNames() == d.GetPtNames(), 'ptNames wrong'
  assert self.d.GetNPossibleVals() == d.GetNPossibleVals(), 'nPossible wrong'
  assert self.d.GetQuantBounds() == d.GetQuantBounds(), 'quantBounds wrong'
  assert self.d.GetResults() == d.GetResults(), 'GetResults wrong'
  assert self.d.GetAllData()[1] == d.GetAllData()[1], 'GetAllData wrong'
  assert self.d.GetInputData()[3] == d.GetInputData()[3], 'GetInputData wrong'
  assert self.d.GetNamedData()[2] == d.GetNamedData()[2], 'GetNamedData wrong'
def _setupMultiTree(self):
  examples = [[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 2], [0, 1, 1, 2],
              [1, 0, 0, 2], [1, 0, 1, 2], [1, 1, 0, 2], [1, 1, 1, 0]]
  data = MLData.MLQuantDataSet(examples)
  attrs = range(0, data.GetNVars())
  t1 = ID3.ID3Boot(data.GetAllData(), attrs, data.GetNPossibleVals())
  self.t1 = t1
  self.examples = examples
def DBToData(dbName, tableName, user='******', password='******', dupCol=-1, what='*', where='',
             join='', pickleCol=-1, pickleClass=None, ensembleIds=None):
  """ constructs an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: if nonzero specifies which column should be used to recognize
        duplicates.

    **Returns**

       an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

  """
  conn = DbConnect(dbName, tableName, user, password)
  res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol, forceList=1)
  nPts = len(res)
  vals = [None] * nPts
  ptNames = [None] * nPts
  classWorks = True
  for i in range(nPts):
    tmp = list(res[i])
    ptNames[i] = tmp.pop(0)
    if pickleCol >= 0:
      if not pickleClass or not classWorks:
        tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
      else:
        try:
          tmp[pickleCol] = pickleClass(str(tmp[pickleCol]))
        except Exception:
          tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
          classWorks = False
      if ensembleIds:
        tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
    else:
      if ensembleIds:
        tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
    vals[i] = tmp
  varNames = conn.GetColumnNames(join=join, what=what)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data
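# Hypothetical usage sketch (not part of the original module): the database name, table
# name, and credentials below are placeholders. Assuming a readable database whose first
# returned column holds point names, DBToData yields an MLData.MLDataSet.
def _exampleDBToData():
  from rdkit.ML.Data.DataUtils import DBToData
  data = DBToData('example.sqlt', 'raw_data', user='someuser', password='somepassword')
  print(data.GetNPts(), data.GetNVars())
  print(data.GetVarNames())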
def _setupPyMultiTree(self):
  from rdkit.ML.InfoTheory import entropy
  ID3.entropy.InfoEntropy = entropy.PyInfoEntropy
  ID3.entropy.InfoGain = entropy.PyInfoGain
  examples = [[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 2], [0, 1, 1, 2],
              [1, 0, 0, 2], [1, 0, 1, 2], [1, 1, 0, 2], [1, 1, 1, 0]]
  data = MLData.MLQuantDataSet(examples)
  attrs = range(0, data.GetNVars())
  t1 = ID3.ID3Boot(data.GetAllData(), attrs, data.GetNPossibleVals())
  self.t1 = t1
  self.examples = examples
def TextToData(reader, ignoreCols=[], onlyCols=None):
  """ constructs an _MLData.MLDataSet_ from a bunch of text
#DOC
    **Arguments**

      - reader needs to be iterable and return lists of elements
        (like a csv.reader)

    **Returns**

       an _MLData.MLDataSet_

  """
  varNames = next(reader)
  if not onlyCols:
    keepCols = []
    for i, name in enumerate(varNames):
      if name not in ignoreCols:
        keepCols.append(i)
  else:
    keepCols = [-1] * len(onlyCols)
    for i, name in enumerate(varNames):
      if name in onlyCols:
        keepCols[onlyCols.index(name)] = i
  nCols = len(varNames)
  varNames = tuple([varNames[x] for x in keepCols])
  nVars = len(varNames)
  vals = []
  ptNames = []
  for splitLine in reader:
    if len(splitLine):
      if len(splitLine) != nCols:
        raise ValueError('unequal line lengths')
      tmp = [splitLine[x] for x in keepCols]
      ptNames.append(tmp[0])
      pt = [None] * (nVars - 1)
      for j in range(nVars - 1):
        try:
          val = int(tmp[j + 1])
        except ValueError:
          try:
            val = float(tmp[j + 1])
          except ValueError:
            val = str(tmp[j + 1])
        pt[j] = val
      vals.append(pt)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data
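# Hypothetical usage sketch (not part of the original module): TextToData only needs an
# iterator that yields rows, so a csv.reader works directly. 'example.csv' is a placeholder;
# its first row must hold the variable names and its first column the point names.
def _exampleTextToData():
  import csv
  from rdkit.ML.Data.DataUtils import TextToData
  with open('example.csv', 'r') as inF:
    data = TextToData(csv.reader(inF))
  print(data.GetVarNames())
  print(data.GetNamedData()[0])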
def BuildDataSet(fileName):
  """ builds a data set from a .dat file

    **Arguments**

      - fileName: the name of the .dat file

    **Returns**

       an _MLData.MLDataSet_

  """
  with open(fileName, 'r') as inFile:
    varNames, qBounds = ReadVars(inFile)
    ptNames, examples = ReadGeneralExamples(inFile)
  data = MLData.MLDataSet(examples, qBounds=qBounds, varNames=varNames, ptNames=ptNames)
  return data
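# Hypothetical usage sketch (not part of the original module): 'example.dat' is a
# placeholder for a file in the .dat format expected by ReadVars/ReadGeneralExamples.
def _exampleBuildDataSet():
  from rdkit.ML.Data.DataUtils import BuildDataSet
  data = BuildDataSet('example.dat')
  print(data.GetNPts(), data.GetVarNames())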
def BuildQuantDataSet(fileName):
  """ builds a data set from a .qdat file

    **Arguments**

      - fileName: the name of the .qdat file

    **Returns**

       an _MLData.MLQuantDataSet_

  """
  with open(fileName, 'r') as inFile:
    varNames, qBounds = ReadVars(inFile)
    ptNames, examples = ReadQuantExamples(inFile)
  data = MLData.MLQuantDataSet(examples, qBounds=qBounds, varNames=varNames, ptNames=ptNames)
  return data
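# Hypothetical usage sketch (not part of the original module): 'example.qdat' is a
# placeholder for a quantized data file. Unlike BuildDataSet, the result here is an
# MLQuantDataSet, so the quantization bounds are available via GetQuantBounds().
def _exampleBuildQuantDataSet():
  from rdkit.ML.Data.DataUtils import BuildQuantDataSet
  data = BuildQuantDataSet('example.qdat')
  print(data.GetNPts())
  print(data.GetQuantBounds())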