def setLearnerVars(self):
    """Copy the values held in the ``*GUI`` attributes to the learner attributes.

    Sets self.priors, self.optAlg, self.stopCrit, self.maxIter / self.eps,
    self.scaleData, self.scaleClass and self.nHidden, then calls
    self.setGUIVars().

    Invalid user input never raises: a warning is shown through
    self.warning() and the values from AZOC.CVANNDEFAULTDICT are used instead.
    """
    # An empty priors field means "no priors"
    if self.priorsGUI:
        self.priors = str(self.priorsGUI)
    else:
        self.priors = None

    self.optAlg = int(self.optAlgGUI)
    # The GUI value is shifted by one; 1 is handled as ITER, anything else as EPS
    self.stopCrit = int(self.stopCritGUI) + 1
    if miscUtilities.isNumber(self.stopValueGUI):
        if self.stopCrit == 1:  # ITER
            # Go through float() so that a numeric input such as "100.0",
            # accepted by the check above, does not make int() raise.
            self.maxIter = int(float(self.stopValueGUI))
        else:  # EPS
            self.eps = float(self.stopValueGUI)
    else:
        self.warning(0, "Value for stop criteria must be a number! Using default value.")
        self.maxIter = AZOC.CVANNDEFAULTDICT["maxIter"]
        self.eps = AZOC.CVANNDEFAULTDICT["eps"]

    self.scaleData = bool(self.scaleDataGUI)
    self.scaleClass = bool(self.scaleClassGUI)

    # Transform the nHidden to a list (comma separated integers in the GUI)
    try:
        self.nHidden = [int(elem.strip()) for elem in str(self.nHiddenGUI).split(",")]
    except (ValueError, TypeError):
        self.warning(0, "Bad values for hidden neuronsi! Using default value.")
        self.nHidden = AZOC.CVANNDEFAULTDICT["nHidden"]
    self.setGUIVars()
def calcMahalanobis(data, testData):
    """
    Calculates Mahalanobis distances.
    The data should only contain attributes that are relevant for similarity. OBS data is assumed to have a response variable.
    data - X matrix used to calculate the covariance matrix
    testData - the examples in an ExampleTable object for which to calculate the MDs

    Missing values in both tables are imputed with
    orange.ImputerConstructor_average before any distance is calculated.

    Returns a list of Mahalanobis distances between the examples in testData and training data.
    The elements of the list are dictionaries, giving the Mahalanobis distances to the average (_MD), the nearest neighbor
    and an average of the 3 nearest neighbors (_train_av3nearest).

    Raises Exception if an attribute of testData holds a non-numeric value.
    """
    # Impute any missing values
    data = orange.ImputerConstructor_average(data)(data)
    testData = orange.ImputerConstructor_average(testData)(testData)

    # Test if there is any non-numeric value within the dataset
    for ex in testData:
        # It is much faster to address the ex elements by their position instead of the correspondent name
        for idx, attr in enumerate(ex.domain.attributes):
            value = ex[idx].value
            if not miscUtilities.isNumber(value):
                raise Exception(
                    "Cannot calculate Mahalanobis distances. The attribute '" +
                    attr.name + "' has non-numeric values. Ex: " + str(value))

    # Create a trainingSet object.
    trainingSet = getTrainingSet(data)
    trainingset_descriptor_names = trainingSet.descr_names
    mahalanobisCalculator = Mahalanobis.MahalanobisDistanceCalculator(trainingSet)

    MDlist = []
    for ex in testData:
        # Create a numeric vector from the example and assure the same order as in trainingset_descriptor_names
        try:
            descriptor_values = [float(ex[name].value) for name in trainingset_descriptor_names]
        except Exception:
            # 'except Exception' (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not converted into a misleading error
            raise Exception("Not possible to calculate Mahalanobis distances. Some attribute is not numeric.")
        MDlist.append(mahalanobisCalculator.calculateDistances(descriptor_values, NO_OF_NEIGHBORS))
    return MDlist
def updateParametersFromTable(self):
    """Updates the parameters of the optimizer with the ones present on GUI table

    Each row of self.paramsTable describes one parameter. For every row the
    entry self.parameters[name] is updated:
        [1], [2], [3] - range type, range definition and aliases (only when the
                        parameter is selected for optimization)
        [4]           - default value
        [5]           - True if the parameter is to be optimized
    self.nParameters ends up as the number of parameters selected for optimization.

    Returns True if all OK
    Returns False if Errors occurred (rows handled before the failing one stay updated)
    """
    #self.paramsNames: ["Name","Optimize","Lower Limit","Upper Limit","Distribution","Step","Default"]
    #self.comboDistItems: ["Continuous","Power2","By Step","Specific Values"]
    table = self.paramsTable
    if table.columnCount() < 7:
        self.setErrors("Wrong number of columns in table!")
        return False

    def cellText(row, col):
        # Stripped text of a plain (non widget) cell
        return str(table.item(row, col).text()).strip()

    def isCombo(widget):
        # Combo box cells are recognised by their type name
        return "Combo" in str(type(widget))

    def defaultValue(row, aliases, rangePars):
        # Value of the "Default" column; 'aliases' only decides whether the
        # selected combo index can be translated through 'rangePars'
        widget = table.cellWidget(row, 6)
        if not isCombo(widget):  # is string
            return cellText(row, 6)
        selected = widget.currentIndex()
        if len(aliases) <= selected:
            # the parameter did not have an alias, the text is the value
            return str(widget.currentText()).strip()
        return rangePars[selected]

    def hasInvalidValue(name, checks):
        # checks: (value, symbolsAllowed, problem) tuples tested in order; warns
        # about the first value that is neither a number nor (when allowed) an
        # expression containing N_EX / N_ATTR
        for value, symbolsAllowed, problem in checks:
            if miscUtilities.isNumber(value):
                continue
            if symbolsAllowed and ("N_EX" in value or "N_ATTR" in value):
                continue
            QMessageBox.warning(self, "Invalid parameter", "Parameter " + name + " has " + problem, QMessageBox.Ok)
            return True
        return False

    self.nParameters = len(self.parameters)
    for row in range(table.rowCount()):
        # Name
        name = cellText(row, 0)
        RangePars, distType = self.getRangeParsAndDistType(self.originalParameters, name)
        if RangePars is None or distType is None:
            self.setErrors("It was not possible to identify the range parameters")
            return False

        # Optimize (check state 2 - presumably Qt.Checked; confirm against the Qt version in use)
        if table.cellWidget(row, 1).checkState() != 2:
            # Not selected for optimization: only the default value is used, so
            # the remaining cells of the row are not read at all
            self.nParameters -= 1
            self.parameters[name][5] = False
            self.parameters[name][4] = defaultValue(row, RangePars, RangePars)
            continue

        Llimit = cellText(row, 2)   # Lower Limit
        Ulimit = cellText(row, 3)   # Upper Limit
        distWidget = table.cellWidget(row, 4)   # Distribution
        if isCombo(distWidget):
            comboDistType = distWidget.currentIndex()
        else:
            comboDistType = self.comboDistItems.index(table.item(row, 4).text())
        step = cellText(row, 5)     # Step
        default = defaultValue(row, self.parameters[name][3], RangePars)

        #['types.StringType', 'values', "['kernel' , 'pls1' , 'simpls']", ['Kernel', 'PLS1', 'SimPLS'], "'simpls'", True, False]
        self.parameters[name][5] = True
        limitChecks = [(Llimit, True, "invalid Lower limit"),
                       (Ulimit, True, "invalid Upper limit")]
        stepCheck = (step, False, "an invalid step value")
        if comboDistType == 0:  # Continuous
            if hasInvalidValue(name, limitChecks):
                return False
            self.parameters[name][1] = "interval"
            self.parameters[name][2] = "[" + Llimit + " , " + Ulimit + "]"
            self.parameters[name][3] = ""
        elif comboDistType == 1:  # Power2
            if hasInvalidValue(name, limitChecks + [stepCheck]):
                return False
            self.parameters[name][1] = "values"
            self.parameters[name][2] = "miscUtilities.power2Range(" + Llimit + "," + Ulimit + "," + step + ")"
            self.parameters[name][3] = ""
        elif comboDistType == 2:  # By Step
            if hasInvalidValue(name, limitChecks + [stepCheck]):
                return False
            self.parameters[name][1] = "values"
            self.parameters[name][2] = "miscUtilities.Range(" + Llimit + "," + Ulimit + "," + step + ")"
            self.parameters[name][3] = ""
        else:  # Specific Values
            # The 'Specific Values' refer to the parameters specified in the original AZLearnersParamsConfig.py file
            self.parameters[name][1] = self.originalParameters[name][1]
            self.parameters[name][2] = self.originalParameters[name][2]
            self.parameters[name][3] = self.originalParameters[name][3]
        self.parameters[name][4] = default
    return True
def getNearestNeighbors(query, n, NNDataPath, FPPath=None, resPath=None, idx=0):
    """ get the n nearest neighbors
        query: bin string with query fingerprint
        n: number of neighbors requested from the 'fpin' command
        NNDataPath: tab separated file; its header must contain the 'Molecule SMILES' and 'Compound Name' columns
        FPPath: fingerprints file given to 'fpin'
        resPath: existing directory where the neighbor images are saved (no images if missing)
        idx: number used in the image file names

        returns an ordered list with the n top neighbors (each one in a dict):
            [ {
                "id"          : ID,
                "expVal"      : ExpValues (rounded to 2 decimals when numeric),
                "similarity"  : TanimotoSimilarity,
                "smi"         : smiles,
                "imgPath"     : imgPath ("" if no image was saved),
                "MeanInhib"   : always '' }, ... ]
        An empty list is returned when query, n, NNDataPath or FPPath is empty
        (so also when FPPath is left at its default) or when the header of
        NNDataPath is not the expected one.

        It will save the images in resPath as NN<idx>_<rank>_<timestamp>.png,
        rank being 1 for the #1 neighbor.

        Raises Exception with the command output if the 'fpin' command fails.
    """
    if not query or not n or not NNDataPath or not FPPath:
        return []

    # get the correct header
    with open(NNDataPath, "r") as dataFile:
        header = dataFile.readline().strip().split('\t')
    if "Molecule SMILES" not in header or "Compound Name" not in header:
        print("NN dataset  " + str(NNDataPath) + "  have not the correct header. It must contain 'Molecule SMILES' and 'Compound Name' attributes.")
        return []

    # Index will have to be sum 1 because the TS will be prepended
    idxID = header.index("Compound Name") + 1
    idxExpVal = len(header)
    idxSMILES = header.index("Molecule SMILES") + 1
    idxSimilarity = 0

    # NOTE(review): the command line is assembled from the arguments without any
    # quoting; query and the paths must come from trusted sources.
    cmdStr = 'echo "' + query + '" | fpin ' + FPPath + " " + NNDataPath + ' 0.0 ' + str(n)
    status, output = commands.getstatusoutput(cmdStr)
    if status:
        print(status)
        print(output)
        raise Exception(str(output))

    #             TS              SMILES                    AZID         DATE       expRes
    # output = "0.7117   CCCC(C)C1(C(=O)NC(=O)NC1=O)CC   AZ10046012   2009-12-02   3.480007"
    # Blank lines (e.g. empty output) are skipped instead of failing on the column access below
    TS = [line.strip().split('\t') for line in output.split("\n") if line.strip()]
    # in TS:
    #    TS[n][0] - tanimoto similarity
    #    TS[n][1] - SMILES
    #    TS[n][2] - AZID
    #    TS[n][-1]- expRes

    res = []
    timeStamp = str(time.time()).replace(".", '')
    saveImages = bool(resPath) and os.path.isdir(resPath)
    for fidx, nn in enumerate(TS):
        ID = nn[idxID]
        expVal = nn[idxExpVal]
        if miscUtilities.isNumber(expVal):
            expVal = str(round(float(expVal), 2))
        SMILES = nn[idxSMILES]
        imgPath = ""
        if saveImages:
            mol = Chem.MolFromSmiles(SMILES)
            # MolFromSmiles gives None for a SMILES it cannot parse; no image then
            if mol is not None:
                imgPath = os.path.join(resPath, "NN" + str(idx) + "_" + str(fidx + 1) + "_" + timeStamp + ".png")
                # save the respective imgPath...
                Draw.MolToImageFile(mol, imgPath, size=(300, 300), kekulize=True, wedgeBonds=True)
        res.append({
            "id": ID,
            "expVal": expVal,
            "similarity": nn[idxSimilarity],
            "smi": SMILES,
            "imgPath": imgPath,
            "MeanInhib": ''
        })
    return res
def calcMahalanobis(data, testData, invCovMatFile = None, centerFile = None, dataTableFile = None, domain = None, nNN = NO_OF_NEIGHBORS):
    """
    Calculates Mahalanobis distances.
    The data should only contain attributes that are relevant for similarity. OBS data is assumed to have a response variable.
    data - X matrix used to calculate the covariance matrix. May be empty/None, in which case 'domain' must be given
    testData - the examples in an ExampleTable object for which to calculate the MDs.
               NOTE: it is modified in place (unknown continuous class values are set to 0)
    invCovMatFile, centerFile, dataTableFile - passed unchanged to Mahalanobis.MahalanobisDistanceCalculator
    domain - only used when data is empty/None: its attributes define the descriptor names and their order
    nNN - passed to calculateDistances for each example

    Returns a list of Mahalanobis distances between the examples in testData and training data.
    The elements of the list are dictionaries, giving the Mahalanobis distances to the average (_MD), the nearest neighbor
    and an average of the 3 nearest neighbors (_train_av3nearest).

    Raises Exception if an attribute of testData holds a non-numeric value.
    """
    # Impute any missing values
    if data:
        data = orange.ImputerConstructor_average(data)(data)
    # If Class is continuous and all class values are unknown (and they usually are in ex to predict), the imputer cannot be created.
    # Since we are only using attributes, not the class, we will assign 0 to the class values in order to impute the testData
    if testData.domain.classVar and testData.domain.classVar.varType == orange.VarTypes.Continuous:
        for ex in testData:
            if ex.getclass().isSpecial():
                ex.setclass(0)
    # This can also happen when calculating a single example with missing attributes
    try:
        averageImputer = orange.ImputerConstructor_average(testData)
    except Exception:
        # Fallback: replace the missing continuous attribute values by 0 and retry
        continuousAttrs = [a for a in testData.domain.attributes if a.varType == orange.VarTypes.Continuous]
        for ex in testData:
            for attr in continuousAttrs:
                if ex[attr].isSpecial():
                    ex[attr] = 0
        averageImputer = orange.ImputerConstructor_average(testData)
    testData = averageImputer(testData)

    # Test if there is any non-numeric value within the dataset
    for ex in testData:
        # It is much faster to address the ex elements by their position instead of the correspondent name
        for idx, attr in enumerate(ex.domain.attributes):
            value = ex[idx].value
            if not miscUtilities.isNumber(value):
                raise Exception("Cannot calculate Mahalanobis distances. The attribute '" +
                                attr.name + "' has non-numeric values. Ex: " +
                                str(value))

    if data:
        trainingSet = getTrainingSet(data)
        trainingset_descriptor_names = trainingSet.descr_names
    else:
        trainingSet = None
        trainingset_descriptor_names = [attr.name for attr in domain.attributes]
    mahalanobisCalculator = Mahalanobis.MahalanobisDistanceCalculator(trainingSet, invCovMatFile, centerFile, dataTableFile)

    MDlist = []
    for ex in testData:
        # Create a numeric vector from the example and assure the same order as in trainingset_descriptor_names
        try:
            descriptor_values = [float(ex[name].value) for name in trainingset_descriptor_names]
        except Exception:
            # 'except Exception' (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not converted into a misleading error
            raise Exception("Not possible to calculate Mahalanobis distances. Some attribute is not numeric.")
        MDlist.append(mahalanobisCalculator.calculateDistances(descriptor_values, nNN))
    return MDlist
def __init__(self, **kwds):
    """Set the default attributes, override them with **kwds and prepare the fold control.

    Any keyword argument replaces (or adds) the attribute of the same name.

    When testAttrFilter names an attribute present in self.data.domain:
      - testFilterVal is a non-empty list: variable control CV (useVarCtrlCV).
        preDefIndices gets 1 for the examples whose attribute value is in
        testFilterVal (allowed in the test set) and 0 for the others.
      - testFilterVal is None: pre-defined folds (usePreDefFolds).
        preDefIndices gets the fold number read from the attribute, 0 meaning
        "always in the train set".
      In both cases the attribute is then removed from self.data.
    Otherwise both modes are disabled and testAttrFilter/testFilterVal are reset to None.

    Raises ValueError if a fold value is not a number or if testFilterVal /
    testAttrFilter are not in the expected format. (These cases used to
    'return False', which is not allowed in __init__ and made Python raise an
    unrelated TypeError when the object was created.)
    """
    self.verbose = 0
    self.logFile = None
    self.resultsFile = None
    self.nExtFolds = 5
    self.nInnerFolds = 5
    self.data = None
    self.learner = None
    self.paramList = None
    self.queueType = "NoSGE"
    self.responseType = None
    self.fixedParams = {}
    self.testAttrFilter = None
    self.testFilterVal = None
    self.sampler = dataUtilities.SeedDataSampler
    # Append arguments to the __dict__ member variable
    self.__dict__.update(kwds)
    self.learnerName = ""

    self.preDefIndices = orange.LongList()
    self.usePreDefFolds = False
    self.useVarCtrlCV = False
    if not (self.testAttrFilter and self.testAttrFilter in self.data.domain):
        # No usable filter attribute: plain CV
        self.usePreDefFolds = False
        self.useVarCtrlCV = False
        self.testAttrFilter = None
        self.testFilterVal = None
        return

    if self.testFilterVal and isinstance(self.testFilterVal, list) and isinstance(self.testAttrFilter, str):
        self.useVarCtrlCV = True
        self.usePreDefFolds = False
        for ex in self.data:
            # 1: compound allowed in the test set
            # 0: compound to not include in the test set. Always to be shifted to the train
            self.preDefIndices.append(1 if ex[self.testAttrFilter].value in self.testFilterVal else 0)
    elif self.testFilterVal is None:
        self.usePreDefFolds = True
        self.useVarCtrlCV = False
        # Enable pre-selected-indices ( index 0 will be set for train Bias)
        foldsCounter = {}
        for ex in self.data:
            value = str(ex[self.testAttrFilter].value)
            if not miscUtilities.isNumber(value):
                message = "Invalid fold value:" + str(value) + ". It must be str convertable to an int."
                self.__log(message)
                raise ValueError(message)
            value = int(float(value))
            foldsCounter[value] = foldsCounter.get(value, 0) + 1
            self.preDefIndices.append(value)
        self.__log("INFO: Pre-selected " + str(len([f for f in foldsCounter if f != 0])) + " folds for CV:")
        self.__log(" Examples in data: " + str(sum(foldsCounter.values())))
        self.__log(" Examples selected for validation: " + str(sum([foldsCounter[f] for f in foldsCounter if f != 0])))
        self.__log(" Examples to be appended to the train set: " + str(foldsCounter.get(0, 0)))
    else:
        message = "ERROR: Attribute Filter Ctrl was selected, but attribute is not in expected format: " + str(self.testAttrFilter)
        self.__log(message)
        raise ValueError(message)
    # The control attribute must not be used for learning
    self.data = dataUtilities.attributeDeselectionData(self.data, [self.testAttrFilter])
def calcMahalanobis(data, testData, invCovMatFile=None, centerFile=None, dataTableFile=None, domain=None, nNN=NO_OF_NEIGHBORS):
    """
    Calculates Mahalanobis distances.
    The data should only contain attributes that are relevant for similarity. OBS data is assumed to have a response variable.
    data - X matrix used to calculate the covariance matrix. May be empty/None, in which case 'domain' must be given
    testData - the examples in an ExampleTable object for which to calculate the MDs.
               NOTE: it is modified in place (unknown continuous class values are set to 0)
    invCovMatFile, centerFile, dataTableFile - passed unchanged to Mahalanobis.MahalanobisDistanceCalculator
    domain - only used when data is empty/None: its attributes define the descriptor names and their order
    nNN - passed to calculateDistances for each example

    Returns a list of Mahalanobis distances between the examples in testData and training data.
    The elements of the list are dictionaries, giving the Mahalanobis distances to the average (_MD), the nearest neighbor
    and an average of the 3 nearest neighbors (_train_av3nearest).

    Raises Exception if an attribute of testData holds a non-numeric value.
    """
    # Impute any missing values
    if data:
        data = orange.ImputerConstructor_average(data)(data)
    # If Class is continuous and all class values are unknown (and they usually are in ex to predict), the imputer cannot be created.
    # Since we are only using attributes, not the class, we will assign 0 to the class values in order to impute the testData
    classVar = testData.domain.classVar
    if classVar and classVar.varType == orange.VarTypes.Continuous:
        for ex in testData:
            if ex.getclass().isSpecial():
                ex.setclass(0)
    # This can also happen when calculating a single example with missing attributes
    try:
        averageImputer = orange.ImputerConstructor_average(testData)
    except Exception:
        # Fallback: replace the missing continuous attribute values by 0 and retry
        continuousAttrs = [
            a for a in testData.domain.attributes
            if a.varType == orange.VarTypes.Continuous
        ]
        for ex in testData:
            for attr in continuousAttrs:
                if ex[attr].isSpecial():
                    ex[attr] = 0
        averageImputer = orange.ImputerConstructor_average(testData)
    testData = averageImputer(testData)

    # Test if there is any non-numeric value within the dataset
    for ex in testData:
        # It is much faster to address the ex elements by their position instead of the correspondent name
        for idx, attr in enumerate(ex.domain.attributes):
            value = ex[idx].value
            if not miscUtilities.isNumber(value):
                raise Exception("Cannot calculate Mahalanobis distances. The attribute '" +
                                attr.name + "' has non-numeric values. Ex: " +
                                str(value))

    if data:
        trainingSet = getTrainingSet(data)
        trainingset_descriptor_names = trainingSet.descr_names
    else:
        trainingSet = None
        trainingset_descriptor_names = [
            attr.name for attr in domain.attributes
        ]
    mahalanobisCalculator = Mahalanobis.MahalanobisDistanceCalculator(
        trainingSet, invCovMatFile, centerFile, dataTableFile)

    MDlist = []
    for ex in testData:
        # Create a numeric vector from the example and assure the same order as in trainingset_descriptor_names
        try:
            descriptor_values = [
                float(ex[name].value) for name in trainingset_descriptor_names
            ]
        except Exception:
            # 'except Exception' (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not converted into a misleading error
            raise Exception(
                "Not possible to calculate Mahalanobis distances. Some attribute is not numeric."
            )
        MDlist.append(
            mahalanobisCalculator.calculateDistances(descriptor_values, nNN))
    return MDlist
def updateParametersFromTable(self):
    """Updates the parameters of the optimizer with the ones present on GUI table

    Each row of self.paramsTable describes one parameter. For every row the
    entry self.parameters[name] is updated:
        [1], [2], [3] - range type, range definition and aliases (only when the
                        parameter is selected for optimization)
        [4]           - default value
        [5]           - True if the parameter is to be optimized
    self.nParameters ends up as the number of parameters selected for optimization.

    Returns True if all OK
    Returns False if Errors occurred (rows handled before the failing one stay updated)
    """
    #self.paramsNames: ["Name","Optimize","Lower Limit","Upper Limit","Distribution","Step","Default"]
    #self.comboDistItems: ["Continuous","Power2","By Step","Specific Values"]
    table = self.paramsTable
    if table.columnCount() < 7:
        self.setErrors("Wrong number of columns in table!")
        return False

    def cellText(row, col):
        # Stripped text of a plain (non widget) cell
        return str(table.item(row, col).text()).strip()

    def isCombo(widget):
        # Combo box cells are recognised by their type name
        return "Combo" in str(type(widget))

    def defaultValue(row, aliases, rangePars):
        # Value of the "Default" column; 'aliases' only decides whether the
        # selected combo index can be translated through 'rangePars'
        widget = table.cellWidget(row, 6)
        if not isCombo(widget):  # is string
            return cellText(row, 6)
        selected = widget.currentIndex()
        if len(aliases) <= selected:
            # the parameter did not have an alias, the text is the value
            return str(widget.currentText()).strip()
        return rangePars[selected]

    def hasInvalidValue(name, checks):
        # checks: (value, symbolsAllowed, problem) tuples tested in order; warns
        # about the first value that is neither a number nor (when allowed) an
        # expression containing N_EX / N_ATTR
        for value, symbolsAllowed, problem in checks:
            if miscUtilities.isNumber(value):
                continue
            if symbolsAllowed and ("N_EX" in value or "N_ATTR" in value):
                continue
            QMessageBox.warning(
                self, "Invalid parameter",
                "Parameter " + name + " has " + problem, QMessageBox.Ok)
            return True
        return False

    self.nParameters = len(self.parameters)
    for row in range(table.rowCount()):
        # Name
        name = cellText(row, 0)
        RangePars, distType = self.getRangeParsAndDistType(
            self.originalParameters, name)
        if RangePars is None or distType is None:
            self.setErrors(
                "It was not possible to identify the range parameters")
            return False

        # Optimize (check state 2 - presumably Qt.Checked; confirm against the Qt version in use)
        if table.cellWidget(row, 1).checkState() != 2:
            # Not selected for optimization: only the default value is used, so
            # the remaining cells of the row are not read at all
            self.nParameters -= 1
            self.parameters[name][5] = False
            self.parameters[name][4] = defaultValue(row, RangePars, RangePars)
            continue

        Llimit = cellText(row, 2)  # Lower Limit
        Ulimit = cellText(row, 3)  # Upper Limit
        distWidget = table.cellWidget(row, 4)  # Distribution
        if isCombo(distWidget):
            comboDistType = distWidget.currentIndex()
        else:
            comboDistType = self.comboDistItems.index(
                table.item(row, 4).text())
        step = cellText(row, 5)  # Step
        default = defaultValue(row, self.parameters[name][3], RangePars)

        #['types.StringType', 'values', "['kernel' , 'pls1' , 'simpls']", ['Kernel', 'PLS1', 'SimPLS'], "'simpls'", True, False]
        self.parameters[name][5] = True
        limitChecks = [
            (Llimit, True, "invalid Lower limit"),
            (Ulimit, True, "invalid Upper limit"),
        ]
        stepCheck = (step, False, "an invalid step value")
        if comboDistType == 0:  # Continuous
            if hasInvalidValue(name, limitChecks):
                return False
            self.parameters[name][1] = "interval"
            self.parameters[name][2] = "[" + Llimit + " , " + Ulimit + "]"
            self.parameters[name][3] = ""
        elif comboDistType == 1:  # Power2
            if hasInvalidValue(name, limitChecks + [stepCheck]):
                return False
            self.parameters[name][1] = "values"
            self.parameters[name][2] = "miscUtilities.power2Range(" + Llimit + "," + Ulimit + "," + step + ")"
            self.parameters[name][3] = ""
        elif comboDistType == 2:  # By Step
            if hasInvalidValue(name, limitChecks + [stepCheck]):
                return False
            self.parameters[name][1] = "values"
            self.parameters[name][2] = "miscUtilities.Range(" + Llimit + "," + Ulimit + "," + step + ")"
            self.parameters[name][3] = ""
        else:  # Specific Values
            # The 'Specific Values' refer to the parameters specified in the original AZLearnersParamsConfig.py file
            self.parameters[name][1] = self.originalParameters[name][1]
            self.parameters[name][2] = self.originalParameters[name][2]
            self.parameters[name][3] = self.originalParameters[name][3]
        self.parameters[name][4] = default
    return True
def getExamplesAndSetTrainBias(self, data, testAttrFilter, testFilterVal):
    """
    Collects and returns the examples that match the filterValue at the Attr defined
    The remaining examples (that do not match the filterValue at the Attr defined) are
    placed in the trainBias to be added in all train events.

    data - the examples to split
    testAttrFilter - name of the control attribute; if None or not in data.domain,
                     data is returned untouched and self.trainBias stays None
    testFilterVal - list of attribute values selecting the validation examples, or
                    None to read a fold number from the attribute (0 means
                    "train bias"; fold f is stored as f - 1 in self.fixedIdx)

    The control attribute is removed from the returned examples.
    Raises Exception if testFilterVal is neither None nor a list, or if a fold
    value is not a number.
    """
    self.trainBias = None
    if testAttrFilter is None or testAttrFilter not in data.domain:
        return data

    allDataEx = len(data)
    if testFilterVal is not None:
        if not isinstance(testFilterVal, list):
            raise Exception("Invalid Attr filter value. It must be a list of strings")
        examples = orange.ExampleTable(data.domain)
        self.trainBias = orange.ExampleTable(data.domain)
        for ex in data:
            value = ex[testAttrFilter].value
            if any(value == Vfilter for Vfilter in testFilterVal):
                examples.append(ex)
            else:
                self.trainBias.append(ex)
        print("INFO: Variable control validation:")
    else:
        # Enable pre-selected-indices
        self.fixedIdx = orange.LongList()
        examples = orange.ExampleTable(data.domain)
        self.trainBias = orange.ExampleTable(data.domain)
        validationFolds = set()
        for ex in data:
            value = str(ex[testAttrFilter].value)
            if not miscUtilities.isNumber(value):
                raise Exception("Invalid fold value:" + str(value) + ". It must be str convertable to an int.")
            value = int(float(value))
            if value != 0:
                validationFolds.add(value)
                examples.append(ex)
                self.fixedIdx.append(value - 1)
            else:
                self.trainBias.append(ex)
        print("INFO: Pre-selected " + str(len(validationFolds)) + " folds for CV:")

    print(" Examples in data: " + str(allDataEx))
    print(" Examples selected for validation: " + str(len(examples)))
    print(" Examples to be appended to the train set: " + str(len(self.trainBias)))
    # The control attribute must not be used for learning
    return dataUtilities.attributeDeselectionData(examples, [testAttrFilter])
def getNearestNeighbors(query, n, NNDataPath, FPPath = None, resPath = None, idx = 0):
    """ get the n nearest neighbors
        query: bin string with query fingerprint
        n: number of neighbors requested from the 'fpin' command
        NNDataPath: tab separated file; its header must contain the 'Molecule SMILES' and 'Compound Name' columns
        FPPath: fingerprints file given to 'fpin'
        resPath: existing directory where the neighbor images are saved (no images if missing)
        idx: number used in the image file names

        returns an ordered list with the n top neighbors (each one in a dict):
            [ {
                "id"          : ID,
                "expVal"      : ExpValues (rounded to 2 decimals when numeric),
                "similarity"  : TanimotoSimilarity,
                "smi"         : smiles,
                "imgPath"     : imgPath ("" if no image was saved),
                "MeanInhib"   : always '' }, ... ]
        An empty list is returned when query, n, NNDataPath or FPPath is empty
        (so also when FPPath is left at its default) or when the header of
        NNDataPath is not the expected one.

        It will save the images in resPath as NN<idx>_<rank>_<timestamp>.png,
        rank being 1 for the #1 neighbor.

        Raises Exception with the command output if the 'fpin' command fails.
    """
    if not query or not n or not NNDataPath or not FPPath:
        return []

    # get the correct header
    with open(NNDataPath, "r") as headerFile:
        header = headerFile.readline().strip().split('\t')
    if "Molecule SMILES" not in header or "Compound Name" not in header:
        print("NN dataset  " + str(NNDataPath) + "  have not the correct header. It must contain 'Molecule SMILES' and 'Compound Name' attributes.")
        return []

    # Index will have to be sum 1 because the TS will be prepended
    idxID = header.index("Compound Name") + 1
    idxExpVal = len(header)
    idxSMILES = header.index("Molecule SMILES") + 1
    idxSimilarity = 0

    # NOTE(review): the command line is assembled from the arguments without any
    # quoting; query and the paths must come from trusted sources.
    cmdStr = 'echo "' + query + '" | fpin ' + FPPath + " " + NNDataPath + ' 0.0 ' + str(n)
    status, output = commands.getstatusoutput(cmdStr)
    if status:
        print(status)
        print(output)
        raise Exception(str(output))

    #             TS              SMILES                    AZID         DATE       expRes
    # output = "0.7117   CCCC(C)C1(C(=O)NC(=O)NC1=O)CC   AZ10046012   2009-12-02   3.480007"
    # Blank lines (e.g. empty output) are skipped instead of failing on the column access below
    TS = []
    for ts in output.split("\n"):
        fields = ts.strip()
        if fields:
            TS.append(fields.split('\t'))
    # in TS:
    #    TS[n][0] - tanimoto similarity
    #    TS[n][1] - SMILES
    #    TS[n][2] - AZID
    #    TS[n][-1]- expRes

    res = []
    timeStamp = str(time.time()).replace(".", '')
    saveImages = bool(resPath) and os.path.isdir(resPath)
    for fidx, nn in enumerate(TS):
        ID = nn[idxID]
        if miscUtilities.isNumber(nn[idxExpVal]):
            expVal = str(round(float(nn[idxExpVal]), 2))
        else:
            expVal = nn[idxExpVal]
        SMILES = nn[idxSMILES]
        imgPath = ""
        if saveImages:
            mol = Chem.MolFromSmiles(SMILES)
            # MolFromSmiles gives None for a SMILES it cannot parse; no image then
            if mol is not None:
                imgPath = os.path.join(resPath, "NN" + str(idx) + "_" + str(fidx + 1) + "_" + timeStamp + ".png")
                # save the respective imgPath...
                Draw.MolToImageFile(mol, imgPath, size=(300, 300), kekulize=True, wedgeBonds=True)
        res.append({
            "id": ID,
            "expVal": expVal,
            "similarity": nn[idxSimilarity],
            "smi": SMILES,
            "imgPath": imgPath,
            "MeanInhib": ''
        })
    return res
def __init__(self, **kwds):
    """Set the default attributes, override them with **kwds and prepare the fold control.

    Any keyword argument replaces (or adds) the attribute of the same name.

    When testAttrFilter names an attribute present in self.data.domain:
      - testFilterVal is a non-empty list: variable control CV (useVarCtrlCV).
        preDefIndices gets 1 for the examples whose attribute value is in
        testFilterVal (allowed in the test set) and 0 for the others.
      - testFilterVal is None: pre-defined folds (usePreDefFolds).
        preDefIndices gets the fold number read from the attribute, 0 meaning
        "always in the train set".
      In both cases the attribute is then removed from self.data.
    Otherwise both modes are disabled and testAttrFilter/testFilterVal are reset to None.

    Raises ValueError if a fold value is not a number or if testFilterVal /
    testAttrFilter are not in the expected format. (These cases used to
    'return False', which is not allowed in __init__ and made Python raise an
    unrelated TypeError when the object was created.)
    """
    defaults = [
        ("verbose", 0),
        ("logFile", None),
        ("resultsFile", None),
        ("nExtFolds", 5),
        ("nInnerFolds", 5),
        ("data", None),
        ("learner", None),
        ("paramList", None),
        ("queueType", "NoSGE"),
        ("responseType", None),
        ("fixedParams", {}),
        ("testAttrFilter", None),
        ("testFilterVal", None),
        ("sampler", dataUtilities.SeedDataSampler),
    ]
    for attrName, attrValue in defaults:
        setattr(self, attrName, attrValue)
    # Append arguments to the __dict__ member variable
    self.__dict__.update(kwds)
    self.learnerName = ""

    self.preDefIndices = orange.LongList()
    self.usePreDefFolds = False
    self.useVarCtrlCV = False
    if not (self.testAttrFilter and self.testAttrFilter in self.data.domain):
        # No usable filter attribute: plain CV
        self.usePreDefFolds = False
        self.useVarCtrlCV = False
        self.testAttrFilter = None
        self.testFilterVal = None
        return

    if self.testFilterVal and isinstance(self.testFilterVal, list) and isinstance(self.testAttrFilter, str):
        self.useVarCtrlCV = True
        self.usePreDefFolds = False
        for ex in self.data:
            if ex[self.testAttrFilter].value in self.testFilterVal:
                # Compound selected to be allowed in the test set
                self.preDefIndices.append(1)
            else:
                # Compound to not include in the test set. Always to be shifted to the train
                self.preDefIndices.append(0)
    elif self.testFilterVal is None:
        self.usePreDefFolds = True
        self.useVarCtrlCV = False
        # Enable pre-selected-indices ( index 0 will be set for train Bias)
        foldsCounter = {}
        for ex in self.data:
            value = str(ex[self.testAttrFilter].value)
            if not miscUtilities.isNumber(value):
                message = "Invalid fold value:" + str(value) + ". It must be str convertable to an int."
                self.__log(message)
                raise ValueError(message)
            value = int(float(value))
            foldsCounter[value] = foldsCounter.get(value, 0) + 1
            self.preDefIndices.append(value)
        validationFolds = [f for f in foldsCounter if f != 0]
        self.__log("INFO: Pre-selected " + str(len(validationFolds)) + " folds for CV:")
        self.__log(" Examples in data: " + str(sum(foldsCounter.values())))
        self.__log(" Examples selected for validation: " + str(sum([foldsCounter[f] for f in validationFolds])))
        self.__log(" Examples to be appended to the train set: " + str(foldsCounter.get(0, 0)))
    else:
        message = "ERROR: Attribute Filter Ctrl was selected, but attribute is not in expected format: " + str(self.testAttrFilter)
        self.__log(message)
        raise ValueError(message)
    # The control attribute must not be used for learning
    self.data = dataUtilities.attributeDeselectionData(
        self.data, [self.testAttrFilter])