def cleanJob():
    """Remove a job unless other screenings still depend on it.

    Reads the job name from the request form. A compound job (dirTemp set)
    is only deleted when at most one screening references it; a screening
    job also deletes every compound job it exclusively owns.
    Returns an HTML fragment describing the outcome.
    """
    full_id = request.form['jobName']
    job_dir, job_file = os.path.split(full_id)
    target = jobHandler.jobCpd()
    target.load(job_file)

    if target.dirTemp is None:
        # Screening job: drop each compound job that no other screening shares,
        # then drop the screening itself.
        for entry in target.results:
            child = jobHandler.jobCpd()
            child.load(entry['JobName'])
            if len(jobHandler.jobsWithCpd(child.filename)) == 1:
                child.remove()
        target.remove()
    else:
        # Compound job: count how many screenings reference it.
        owners = jobHandler.jobsWithCpd(target.filename)
        if len(owners) > 1:
            # Still shared — refuse to delete.
            return 'Multiple screening are using this job. It won\'t be removed. <br> <a href="/">Go to the initial page</a>'
        if len(owners) == 1:
            # Detach the compound from its single parent screening...
            parent = jobHandler.jobCpd()
            parent.load(owners[0])
            full_name = os.path.join(job_dir, job_file)
            parent.results = [entry for entry in parent.results
                              if entry['JobName'] != full_name]
            parent.update()
            # ...then delete the compound's folder and index entry.
            target.remove()

    return '%s, %s REMOVED <br> <a href="/jobs">Go to the list jobs page</a>' % (
        job_dir, job_file)
def getFullTrain(trainModel, jobResults):
    """Merge calibration (training) results with per-compound job info.

    trainModel -- list of dicts per training compound with at least
                  'smi', 'Gcalc', 'wi', 'idsims'.
    jobResults -- list of dicts per compound job with 'smi', 'DGexp',
                  'JobName', 'Status'.

    Returns (datasetInfo, listPoses):
      datasetInfo -- one dict per compound (id, smi, DGexp, DGcalc, wi,
                     idSims, JobFile, Status, CI_* fields).
      listPoses   -- [{'cpd': index, 'poses': ...}] for compounds whose
                     starting pose files could be located (best effort).
    """
    datasetInfo = []
    listPoses = []
    i = 0
    for cpd in jobResults:
        i += 1
        cpdInfo = {}
        # Locate this compound in the training set by SMILES match.
        TSfound = False
        for j in range(len(trainModel)):
            if cpd['smi'] == trainModel[j]['smi']:
                TSfound = True
                break
        cpdInfo['id'] = i
        cpdInfo['smi'] = cpd['smi']
        cpdInfo['DGexp'] = float(cpd['DGexp'])
        cpdInfo['JobFile'] = cpd['JobName']
        cpdInfo['Status'] = cpd['Status']
        # BUGFIX: CI_* fields were only set in the not-found branch, but the
        # spreadsheet exporter reads them for every compound. Default them
        # unconditionally (they are not computed for calibration jobs).
        cpdInfo['CI_decEle'] = ''
        cpdInfo['CI_decVdw'] = ''
        cpdInfo['CI_Tanimoto'] = ''
        cpdInfo['CI_Dene'] = ''
        cpdInfo['CI_Yrange'] = ''
        if TSfound:
            cpdInfo['DGcalc'] = float(trainModel[j]['Gcalc'])
            wi = ['%.3f' % x for x in trainModel[j]['wi']]
            cpdInfo['wi'] = ', '.join(wi)
            idposes = ['%02d' % x for x in trainModel[j]['idsims']]
            cpdInfo['idSims'] = ', '.join(idposes)
        else:
            cpdInfo['DGcalc'] = ''
            cpdInfo['wi'] = ''
            cpdInfo['idSims'] = ''
        datasetInfo.append(cpdInfo)
        # Best effort: extract the starting pose files for this compound.
        # Any failure (missing job, missing folder) is logged and skipped.
        try:
            jobcpd = jobHandler.jobCpd()
            jobcpd.load(cpdInfo['JobFile'])
            completed, cpdPoses = getPoses(jobcpd.dirTemp)
            if completed:
                listPoses.append({'cpd': i, 'poses': cpdPoses})
        except Exception as e:
            logging.debug('Error in loading poses files, %s' % e)
    # BUGFIX: callers unpack two values (datasetData, listPoses); the
    # original fell off the end and returned None.
    return datasetInfo, listPoses
def predictWorkflow(self, molecules, detail, progress):
    # Submit an SDF file for prediction, poll until the screening finishes,
    # and collect per-compound results.
    #
    # output of the fuction should be:
    # list of (True, (molPR,molAD,molCI)) for each compound
    # output for each compound is a list of:
    #   (molPR,molAD,molCI)
    #   molPR: (True, float with predicted y)
    #   molAD: (True, integer-sum of violations)
    #   molCI: (True, float- SDEP and violations based value)
    #
    # NOTE(review): `detail` and `progress` are never used in this body, and
    # `listPreds` is built but no return statement is visible here — confirm
    # whether a trailing `return` was intended.
    launchDir = os.getcwd()
    logging.info('WORKING DIRECTORY IS %s' % launchDir)
    # Terminal job states: polling stops once the screening reaches one of these.
    statEndJob = ['FAILED', 'CANCELLED', 'DONE']
    completed = False
    modelId = {'modelProt': self.protein, 'modelProtVer': self.version}
    # Check existence file input and check number of molecules for preliminary 'Failed' result setting
    # Count molecules by SDF record terminators so every compound gets a
    # placeholder (failed) entry even if the submission crashes.
    nmols = 0
    if os.path.isfile(molecules):
        with open(molecules, 'r') as infile:
            sdf = infile.read()
        for line in sdf.splitlines():
            if line == '$$$$':
                nmols += 1
    listPreds = [[False, [(False, 0), (False, 0), (False, 0)]]
                 for i in range(nmols)]
    try:
        newSdfFn = molecules
        prediction = True
        fieldExp = ''
        # Submit job
        success, results = submitScreen(newSdfFn,
                                        modelId,
                                        prediction,
                                        etoxlie_folder=etoxlie_folder,
                                        fieldExp=str(fieldExp),
                                        jobid=self.jobid)
        #if success results is the filename id
        if not success:
            raise Exception('Submission failed: %s' % results)
        # Load job as class
        jobFN = results
        jobScreen = jobHandler.jobCpd()
        jobScreen.load(jobFN)
        # Check job status: poll every 10 s until a terminal state is reached.
        while jobScreen.status not in statEndJob:
            updated, status, results = screenSDF(jobScreen.results,
                                                 jobScreen.prediction,
                                                 jobScreen.modelProt,
                                                 jobScreen.modelProtVer,
                                                 jobScreen.experiment)
            if updated:
                jobScreen.status = status
                jobScreen.results = results
                jobScreen.update()
            sleep(10)
        # if it is finished, get the results in the proper format
        for i, cpd in enumerate(jobScreen.results):
            doneCpd = False
            molPR = (False, 0)
            molAD = (False, 0)
            molCI = (False, 0)
            if cpd['Status'] == 'DONE':
                molPR = (True, cpd['DGcalc'])
                molAD = (True, cpd['CI'])
                molCI = (True, cpd['Err'])
                doneCpd = True
            listPreds[i] = [doneCpd, [molPR, molAD, molCI]]
            # Clean single job: remove the per-compound job unless another
            # screening still references it.
            cpdFileNm = cpd['JobName']
            listScreens = jobHandler.jobsWithCpd(cpdFileNm)
            # if more than one: stop
            if len(listScreens) == 1:
                # Update screening job
                cpdJob = jobHandler.jobCpd()
                cpdJob.load(cpdFileNm)
                #clean mol folder and index
                cpdJob.remove()
        # NOTE(review): assuming the screening job itself is removed once, after
        # the per-compound cleanup loop — confirm against original layout.
        jobScreen.remove()
        completed = True
    except Exception, e:
        print e
def extractJob(jobId, tmpFolder, modelFolder, outPref=''):
    """Export a stored job to an Excel report and dump its input SDF.

    jobId       -- job file name; a '.dsr' extension marks a screening
                   (dataset) job with per-compound results.
    tmpFolder   -- folder containing the job files.
    modelFolder -- model folder passed through to getStats().
    outPref     -- unused in this body (kept for interface compatibility).

    Writes 'Info_<jobId>.xlsx', optionally a 'Poses_<jobId>' folder, and
    'Input_<jobId>.sdf'. Exits the process via sys.exit() when the job
    file does not exist.
    """
    extFile = os.path.splitext(jobId)[1]
    if extFile == '.dsr':
        dsr = True
    else:
        dsr = False
    jobFn = os.path.join(tmpFolder, jobId)
    if not os.path.exists(jobFn):
        logging.error('JOB ID %s NOT FOUND!!!' % jobFn)
        sys.exit()
    job = jobHandler.jobCpd()
    job.load(jobFn)
    if job.prediction:
        typeJob = 'PREDICTION'
    else:
        typeJob = 'CALIBRATION'
    if dsr:
        ### Write dataset information
        datasetInfo = True
        if typeJob == 'CALIBRATION':
            # LOAD ALL THE INFO ABOUT CALIBRATION JOB
            if job.status == 'DONE':
                completed, trainModel = loadTrainInfo('trainSet',
                                                      job.modelProt,
                                                      job.modelProtVer)
                if not completed:
                    datasetInfo = False
                    logging.error(trainModel)
                else:
                    datasetData, listPoses = getFullTrain(
                        trainModel, job.results)
        else:
            # LOAD ALL THE INFO ABOUT PREDICTION JOB
            datasetData, listPoses = getFullPred(job.sdf, job.results)
        if job.status == 'DONE':
            # NOTE(review): if the calibration info failed to load above,
            # datasetData is unbound here and getStats would NameError —
            # confirm intended behaviour for that path.
            stats = getStats(datasetData, modelFolder, job.modelProt,
                             job.modelProtVer)
        # NOTE(review): listPoses is also unbound on the failed-calibration
        # path (and for a not-DONE calibration job) — verify.
        if len(listPoses) > 0:
            dirNmPoses = 'Poses_%s' % jobId
            extractPoses(listPoses, dirNmPoses)
    ## WRITE OUT INFORMATION
    wb = xlsxwriter.Workbook('Info_%s.xlsx' % jobId)
    boldFmt = wb.add_format({'bold': True})
    ws = wb.add_worksheet()
    # Header block: job identity and provenance.
    ws.write(0, 0, 'Filename', boldFmt)
    ws.write(0, 1, job.filename)
    ws.write(0, 2, 'Type', boldFmt)
    ws.write(0, 3, typeJob)
    ws.write(0, 4, 'Status', boldFmt)
    ws.write(0, 5, job.status)
    ws.write(1, 0, 'Model Id', boldFmt)
    ws.write(1, 1, job.modelProt)
    ws.write(1, 2, 'Model Version', boldFmt)
    ws.write(1, 3, job.modelProtVer)
    ws.write(2, 0, 'Data Creation', boldFmt)
    ws.write(2, 1,
             time.strftime("%Y-%m-%d %H:%M:%S",
                           time.localtime(job.dateSub)))
    # NOTE(review): datasetInfo is only assigned when dsr is True; calling
    # this with a non-'.dsr' id would NameError here — confirm extractJob is
    # only ever invoked for '.dsr' files.
    if datasetInfo:
        # Model statistics section (values filled only for finished jobs).
        ws.write(4, 0, 'STATISTICS', boldFmt)
        ws.write(5, 0, 'n cpds', boldFmt)
        ws.write(5, 1, 'SDEP', boldFmt)
        ws.write(5, 2, 'SSreg', boldFmt)
        ws.write(5, 3, 'SSy', boldFmt)
        ws.write(5, 4, 'q2', boldFmt)
        ws.write(5, 5, 'SSy noInt', boldFmt)
        ws.write(5, 6, 'q2 noInt', boldFmt)
        ####
        if job.status == 'DONE':
            ws.write(6, 0, stats['ncpd'])
            ws.write(6, 1, stats['RMSE'])
            ws.write(6, 2, stats['ssreg'])
            ws.write(6, 3, stats['ssY'])
            ws.write(6, 4, stats['r2'])
            ws.write(6, 5, stats['ssYnoint'])
            ws.write(6, 6, stats['r2noint'])
        # Per-compound table.
        ws.write(8, 0, 'LIST COMPOUNDS', boldFmt)
        ### stats['ssYnoint']
        ws.write(9, 0, 'n.', boldFmt)
        ws.write(9, 1, 'smi', boldFmt)
        ws.write(9, 2, 'experimental', boldFmt)
        ws.write(9, 3, 'calculated', boldFmt)
        ws.write(9, 4, 'file', boldFmt)
        ws.write(9, 5, 'status', boldFmt)
        ws.write(9, 6, 'simulations ID', boldFmt)
        ws.write(9, 7, 'simulations Wi', boldFmt)
        ws.write(9, 8, 'CI:', boldFmt)
        ws.write(9, 9, 'Y range', boldFmt)
        ws.write(9, 10, 'Tanimoto', boldFmt)
        ws.write(9, 11, 'Delta Ene', boldFmt)
        ws.write(9, 12, 'decomp Coul', boldFmt)
        ws.write(9, 13, 'decomp L-J', boldFmt)
        row0 = 10  # NOTE(review): row0 appears unused.
        row = 10
        for cpd in datasetData:
            ws.write(row, 0, cpd['id'])
            ws.write(row, 1, cpd['smi'])
            ws.write(row, 2, cpd['DGexp'])
            ws.write(row, 3, cpd['DGcalc'])
            ws.write(row, 4, cpd['JobFile'])
            ws.write(row, 5, cpd['Status'])
            ws.write(row, 6, cpd['idSims'])
            ws.write(row, 7, cpd['wi'])
            # Overall CI is the concatenation/sum of the individual CI flags
            # (string concatenation when the fields are '' placeholders).
            ws.write(
                row, 8, cpd['CI_Yrange'] + cpd['CI_Tanimoto'] +
                cpd['CI_Dene'] + cpd['CI_decEle'] + cpd['CI_decVdw'])
            ws.write(row, 9, cpd['CI_Yrange'])
            ws.write(row, 10, cpd['CI_Tanimoto'])
            ws.write(row, 11, cpd['CI_Dene'])
            ws.write(row, 12, cpd['CI_decEle'])
            ws.write(row, 13, cpd['CI_decVdw'])
            row += 1
    wb.close()
    # Dump the raw input SDF alongside the report.
    sdfOutFn = 'Input_%s.sdf' % jobId
    with open(sdfOutFn, 'w') as sdfOut:
        sdfOut.write(job.sdf)
def getFullPred(strSdf, results): #for cpd: {'Status', 'JobFile', 'DGexp', 'DGcalc', 'wi', 'smi', 'idSims', 'id'} ## Try to get DGexp from sdf sdfMols = [] molsdf = '' for line in strSdf.splitlines(): molsdf = '%s%s\n' % (molsdf, line) if line == '$$$$': sdfMols.append(molsdf) molsdf = '' pbDS = [] listkeys = [] for nm, mol in enumerate(sdfMols): pbMol = pb.readstring('sdf', str(mol)) pbDS.append(pbMol) listkeys = listkeys + pbMol.data.keys() lenDS = len(pbDS) listkeys = [x for x in set(listkeys) if listkeys.count(x) == lenDS] print '\n\n ATTRIBUTES FOUND IN THE SUBMITTED SDF FILE:\n\n' for i, item in enumerate(listkeys): print('%d.\t\t\t%s' % (i + 1, item)) while True: keyId = raw_input( '\nSelect the field with experimental DGbind [0 is not present]?') try: keyId = int(keyId) if keyId >= 0: if keyId <= len(listkeys): break raise exception except: print '\t Invalid choice %s' % str(keyId) if keyId > 0: expList = [{ 'smi': x.write('smi').split()[0], 'DGexp': x.data[listkeys[keyId - 1]] } for x in pbDS] else: expList = [{ 'smi': x.write('smi').split()[0], 'DGexp': 0 } for x in pbDS] ##### EXPERIMENTS LOADED, BEGIN TO RESUME DATA datasetInfo = [] listPoses = [] i = 0 for cpd in results: i += 1 cpdInfo = {} TSfound = False for j in range(len(expList)): if cpd['smi'] == expList[j]['smi']: TSfound = True break cpdInfo['id'] = i cpdInfo['smi'] = cpd['smi'] try: if not TSfound: raise exception cpdInfo['DGexp'] = float(expList[j]['DGexp']) except: cpdInfo['DGexp'] = '' cpdInfo['JobFile'] = cpd['JobName'] cpdInfo['Status'] = cpd['Status'] try: cpdInfo['DGcalc'] = float(cpd['DGcalc']) except: cpdInfo['DGcalc'] = '' try: wi = ['%.3f' % x for x in cpd['wi']] cpdInfo['wi'] = ', '.join(wi) except: cpdInfo['wi'] = '' try: idposes = ['%02d' % x for x in cpd['idpose']] cpdInfo['idSims'] = ', '.join(idposes) except: cpdInfo['idSims'] = '' try: cpdInfo['CI_decEle'] = cpd['CI_analysis']['decEle'] cpdInfo['CI_decVdw'] = cpd['CI_analysis']['decVdw'] cpdInfo['CI_Tanimoto'] = 
cpd['CI_analysis']['Tanimoto'] cpdInfo['CI_Dene'] = cpd['CI_analysis']['Dene'] cpdInfo['CI_Yrange'] = cpd['CI_analysis']['Yrange'] except: cpdInfo['CI_decEle'] = '' cpdInfo['CI_decVdw'] = '' cpdInfo['CI_Tanimoto'] = '' cpdInfo['CI_Dene'] = '' cpdInfo['CI_Yrange'] = '' datasetInfo.append(cpdInfo) ### Try to extract starting pose files try: jobcpd = jobHandler.jobCpd() jobcpd.load(cpdInfo['JobFile']) completed, cpdPoses = getPoses(jobcpd.dirTemp) if completed: listPoses.append({'cpd': i, 'poses': cpdPoses}) except Exception, e: logging.debug('Error in loading poses files, %s' % e)
def submitScreen(sdfFn, modelId, prediction, etoxlie_folder='.', fieldExp='Activity', misc=None, jobid=None):
    """Validate and register a screening run for an SDF file.

    sdfFn      -- path of the SDF file to screen.
    modelId    -- {'modelProt': ..., 'modelProtVer': ...}.
    prediction -- True for a prediction run, False for calibration.
    fieldExp   -- SDF data field holding the experimental activity.
    misc       -- calibration options ({'isGamma':..., 'fixBeta':...}),
                  compared against existing jobs to spot duplicates.

    Returns (True, <job filename>) on success or (False, <error>) on failure.
    """
    state = 'SUBMITTED'
    # 1. Read the input SDF; a missing file aborts the submission.
    if os.path.exists(sdfFn):
        with open(sdfFn, 'r') as sdf_file:
            sdf = sdf_file.read()
    else:
        outcome = "File not found: %s" % sdfFn
        state = 'FAILED'
        sdf = ''
    # 2a. Refuse a duplicate calibration: same SDF, model, mode and options
    #     already being screened.
    if state != 'FAILED':
        for previous in jobHandler.collectJobs(cpd=False, sort=True):
            is_duplicate = (previous.status != 'FAILED'
                            and sdf == previous.sdf
                            and modelId['modelProt'] == previous.modelProt
                            and modelId['modelProtVer'] == previous.modelProtVer
                            and prediction == previous.prediction
                            and not prediction
                            and misc == previous.experiment)
            if is_duplicate:
                state = 'FAILED'
                outcome = 'SDF screen is already in process (jobID: %s)' % previous.filename
    # 2b. Make sure the requested model (and version) can be loaded.
    if state != 'FAILED':
        loaded, outcome = loadModel(os.path.join(modelDir,
                                                 modelId['modelProt']),
                                    prediction=prediction,
                                    verModel=modelId['modelProtVer'])
        if not loaded:
            state = 'FAILED'
        elif prediction:
            model, params = outcome
        else:
            model = outcome
    # 3. Create the screening job record.
    if state != 'FAILED':
        screenJob = jobHandler.jobCpd()
        screenJob.create(sdf,
                         modelId['modelProt'],
                         modelId['modelProtVer'],
                         prediction=prediction,
                         jobid=jobid)
        screenJob.sdf = sdf
        screenJob.experiment = misc
        # 4. Submit one job per compound (already-running compounds are
        #    linked to the existing run instead of being resubmitted).
        submitted, outcome = submitCPD(sdf, modelId['modelProt'],
                                       modelId['modelProtVer'], prediction,
                                       fieldExp)
        if submitted:
            screenJob.results = outcome
        else:
            screenJob.status = 'FAILED'
            screenJob.results = outcome
        screenJob.update()
    if state == 'FAILED':
        return (False, outcome)
    return (True, screenJob.filename)
def submitCPD(strSdf, modelProt, modelProtVer, prediction, fieldExp='', jobid=None):
    """Launch (or re-attach to) one simulation job per molecule in an SDF string.

    strSdf       -- multi-molecule SDF file content as a single string.
    modelProt    -- protein model identifier.
    modelProtVer -- protein model version.
    prediction   -- True for prediction; False for calibration (then the
                    experimental value is read from data field `fieldExp`).
    jobid        -- optional id forwarded to newly created jobs.

    Returns (True, results) with one dict per molecule
    ({'JobName','Status','smi','DGexp'}), or (False, exception) on any error.
    """
    results = []
    logging.debug('Start submitting compounds simulations')
    try:
        listCpds = jobHandler.collectJobs(etoxlie_folder, cpd=True, sort=True)
        # Split the SDF string into single-molecule blocks on '$$$$'
        # (workaround: pybel has no direct multi-molecule readstring).
        sdfMols = []
        molsdf = ''
        for line in strSdf.splitlines():
            molsdf = '%s%s\n' % (molsdf, line)
            if line == '$$$$':
                sdfMols.append(molsdf)
                molsdf = ''
        for nm, mol in enumerate(sdfMols):
            pbMol = pb.readstring('sdf', mol)
            molDict = {}
            # Calibration needs the experimental value from the SDF data field.
            if not prediction:
                dg = pbMol.data[fieldExp]
            else:
                dg = None
            sdf = mol
            smi = pbMol.write('smi').split()[0]
            for runningCpd in listCpds:
                ## Check if already submitted = same sdf and protein model
                if sdf == runningCpd.sdf and modelProt == runningCpd.modelProt:
                    molDict['JobName'] = runningCpd.filename
                    molDict['Status'] = 'SUBMITTED'  #runningCpd.status
                    logging.debug(
                        "job for molecule %d:%s already processed id %s" %
                        (nm, smi, runningCpd.filename))
            # No matching run found: create a fresh compound job.
            if not bool(molDict):
                newJob = jobHandler.jobCpd()
                newJob.create(sdf, modelProt, modelProtVer, jobid=jobid)
                molDict['JobName'] = newJob.filename
                molDict['Status'] = newJob.status
                logging.debug("job for molecule %d:%s started with id %s" %
                              (nm, smi, newJob.filename))
            molDict['smi'] = smi
            molDict['DGexp'] = dg
            results.append(molDict)
        # BUGFIX: the success path previously fell off the end and returned
        # None, while the caller unpacks (success, results).
        return (True, results)
    except Exception as e:
        return (False, e)
def getEnergies(listCpds):
    '''From list of compounds of the job, get: ene,dec,Y,fp

    ene is list of simulations. for simulation [ncpd, index, vdw, ele]
    decvdw and decene are list of compounds. for each compounds list of
    simulations. for simulation [ncpd, index, vdwres1, vdwres2, ...]

    Only compounds whose Status is 'DONE' are processed. Simulation indices
    are zero-padded to 3 digits: a leading '0' marks the unbound/reference
    simulations, and the first two digits group replicas of the same pose.

    NOTE(review): callers treat the result as a dict with keys
    {ids, Y, ene, smi, decVdw, decEle}, but no return statement is visible
    in this view — confirm a trailing return was not lost.
    '''
    logging.debug("Gathering energies for the list of compounds")
    ene = []
    decVdw = []
    decEle = []
    Y = []
    smi = []
    idcpd = []
    jobCpd = jobHandler.jobCpd()
    for ncpd, cpd in enumerate(listCpds):
        # Exclude FAILED jobs
        if cpd['Status'] == 'DONE':
            logging.debug("Loading data for compound: %s" % cpd['JobName'])
            # 1. Get Y (experimental value; None when not provided)
            if cpd['DGexp'] is not None:
                Y.append(float(cpd['DGexp']))
            else:
                Y.append(None)
            # 2. Get fingerprint (SMILES kept for later fingerprinting)
            smi.append(cpd['smi'])
            # 3 Load energies from job, sorted by simulation index
            jobCpd.load(os.path.join(modelDir, cpd['JobName']))
            energies = jobCpd.results
            energies = sorted(jobCpd.results,
                              key=lambda k: k['index'],
                              reverse=False)
            # GET ENERGIES
            listContr = ['vdw', 'ele']
            # 4. Get interaction energies: average the reference (index
            # starting with '0') simulations per contribution...
            ene0 = {}
            for contribute in listContr:
                ene0[contribute] = [
                    x[contribute] for x in energies
                    if ('%03d' % x['index'])[0] == '0'
                ]
                if len(ene0[contribute]) > 1:
                    ene0[contribute] = np.mean(ene0[contribute])
            # ...then, per pose group (index prefix), average the
            # reference-subtracted bound energies over replicas.
            indeces = list(
                set([('%03d' % x['index'])[:-1] for x in energies
                     if ('%03d' % x['index'])[0] != '0']))
            for idx in indeces:
                enepose = [[
                    sim[contribute] - ene0[contribute]
                    for contribute in listContr
                ] for sim in energies if ('%03d' % sim['index'])[:-1] == idx]
                if len(enepose) > 1:
                    enepose = np.mean(enepose, axis=0)
                elif len(enepose) == 1:
                    enepose = enepose[0]
                ene.append([ncpd, int(idx)] + list(enepose))
            # 5. Get decomposed interaction energies (per-residue vdw/ele)
            deceneDict = {}
            for contribute in listContr:
                contribute = 'dec_%s' % contribute
                deceneDict[contribute] = []
                for idx in indeces:
                    # NOTE(review): `sim` in the first for-clause below is NOT
                    # bound by this comprehension — it relies on the variable
                    # leaked into function scope by the `enepose` list
                    # comprehension above (a Python-2-only behaviour). Very
                    # fragile; confirm before any py3 migration.
                    decpose = {(int(k) if k != 'rest' else str(k)): []
                               for k in sim[contribute] for sim in energies
                               if ('%03d' % sim['index'])[:-1] == idx}
                    # Collect per-residue values over all replicas of the pose;
                    # residue keys are ints except the literal 'rest' bucket.
                    for sim in energies:
                        if ('%03d' % sim['index'])[:-1] == idx:
                            for k, v in sim[contribute].items():
                                try:
                                    k = int(k)
                                except Exception, e:
                                    k = str(k)
                                decpose[k].append(v)
                    # Average each residue's contribution over replicas.
                    decpose = {k: np.mean(v) for k, v in decpose.items()}
                    deceneDict[contribute].append(
                        [ncpd, int(idx)] +
                        [decpose[x] for x in sorted(decpose)])
            #if ('%03d'%sim['index'])[0] != '0' :
            #    print ('%03d'%x['index'])[0]
            #    simdec={( int(k) if k != 'rest' else str(k)):float(v) for k,v in sim[contribute].items()}
            #    print simdec
            #    deceneDict[contribute].append( [ncpd,sim['index']]+[simdec[x] for x in sorted(simdec)] )
            decVdw.append(deceneDict['dec_vdw'])
            decEle.append(deceneDict['dec_ele'])
            idcpd.append(ncpd)
def screenSDF(listCpds, prediction, modelProt, modelProtVer, misc): # Check all the processes of a sdf screening, update status job and status single cpds runs in results updated = False status = 'SUBMITTED' try: completed = True for nc, cpd in enumerate(listCpds): # Check status single cpd (only if not already finished) if cpd['Status'] not in statEndJob: completed = False jobCpd = jobHandler.jobCpd() jobCpd.load(os.path.join(modelDir, cpd['JobName'])) #Update progress if cpd['Status'] != jobCpd.status: updated = True listCpds[nc]['Status'] = jobCpd.status listCpds[nc]['Results'] = jobCpd.results if jobCpd.status == 'DONE': if prediction: ### PREDICT COMPOUND AND UPDATE RESULTS WITH PREDICTION #here function for pred and AD. Results is a list of dictionaries containing energies for each simulation data = getEnergies([listCpds[nc]]) success, results = loadModel(os.path.join( modelDir, modelProt), prediction=True, verModel=modelProtVer) if not success: print results listCpds[nc]['Status'] = 'FAILED' #raise Exception, results continue else: modelLIE, params = results try: Gcalc, idposes, wi, sdep, sspred = predictLie( data, params['LIE']) listCpds[nc] listCpds[nc]['DGexp'] = None listCpds[nc]['DGcalc'] = Gcalc[0] listCpds[nc]['idpose'] = idposes[0] listCpds[nc]['wi'] = wi[0] CI = predictAD(data, { 'DGcalc': Gcalc, 'idposes': idposes, 'wi': wi }, params['AD']) listCpds[nc]['CI_analysis'] = CI[0] listCpds[nc]['CI'] = sum(CI[0].values()) listCpds[nc]['Err'] = predictError( sum(CI[0].values()), params['LIE']['sdep']) logging.info('RESULTS:') for item in listCpds[nc]: logging.info("{0} {1}".format( item, listCpds[nc][item])) logging.info('END RESULTS') except Exception, e: logging.error( 'Error in computing energy for job %s: %s' % (cpd['JobName'], e)) listCpds[nc]['Status'] = 'FAILED' continue if completed: logging.info('Job completed') if not prediction: # check type of calibration for backward compatibility: if misc == None: misc = {'isGamma': False, 'fixBeta': False} 
# Here collect energies print 'GET ENERGIES' data = getEnergies( listCpds ) # results is a dictionary with {ids, Y, ene, smi, decVdw, decEle} logging.info( 'Starting calibration for protein %s, version %s' % (modelProt, modelProtVer)) # Here create model print 'CALIBRATE LIE' LIEmodel = calibrateLie(data, gamma=misc['isGamma'], fixBeta=misc['fixBeta']) print 'CALIBRATE AD' ADmodel = calibrateAD(data, LIEmodel) modelId = { 'modelProt': modelProt, 'modelProtVer': '%04d' % modelProtVer } print 'SAVE MODEL' success, out = saveModel(modelDir, modelId, LIEmodel, ADmodel) if not success: status = 'FAILED' raise Exception, out logging.info('New model saved on file %s' % out) deltaId = 0 for n, id in enumerate(data['ids']): if listCpds[id + deltaId]['Status'] == 'DONE': listCpds[id + deltaId]['DGcalc'] = LIEmodel['Gcalc'][n] listCpds[id + deltaId]['idpose'] = LIEmodel['idx'][n] listCpds[id + deltaId]['wi'] = LIEmodel['wi'][n] else: deltaId += 1 status = 'DONE' updated = True results = listCpds