def getSMRTCellInfoFromFullPath(fullPath):
    cellPath = os.path.dirname(os.path.dirname(fullPath))
    primaryFolder = os.path.basename(os.path.dirname(fullPath))
    context = SecondaryJobService.getCellContext(os.path.basename(fullPath))
    limsCode = LIMSMapper.limsCodeFromCellPath(cellPath)
    return {'SMRTCellPath': cellPath,
            'PrimaryFolder': primaryFolder,
            'Context': context,
            'LIMSCode': limsCode}
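# A minimal usage sketch for getSMRTCellInfoFromFullPath (the path below is
# hypothetical, chosen only to illustrate the <cellPath>/<primaryFolder>/<file>
# layout that the two nested dirname() calls assume):
#
#   info = getSMRTCellInfoFromFullPath('/mnt/data/runs/2770333/0007/m130114_e1.bas.h5')
#   # info['SMRTCellPath']  -> '/mnt/data/runs/2770333'
#   # info['PrimaryFolder'] -> '0007'
#   # info['Context']       -> whatever SecondaryJobService.getCellContext()
#   #                          derives from the file name 'm130114_e1.bas.h5'
#   # info['LIMSCode']      -> LIMSMapper's code for the cell path, if any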
def _getSMRTCellInfo(self, jobID):
    inputs = self._getJobInputs(jobID)
    inputData = []
    for i in inputs:
        inputDict = {'SMRTCellPath': i.get('collectionPathUri'),
                     'PrimaryFolder': i.get('primaryResultsFolder'),
                     'Context': 'none',
                     'LIMSCode': LIMSMapper.limsCodeFromCellPath(i.get('collectionPathUri'))}
        inputData.append(inputDict)
    return inputData
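# Sketch of the shapes involved (field values are hypothetical): each entry
# returned by self._getJobInputs(jobID) is expected to carry at least
# 'collectionPathUri' and 'primaryResultsFolder', e.g.
#
#   {'collectionPathUri': '/mnt/data/runs/2770333', 'primaryResultsFolder': '0007'}
#
# which _getSMRTCellInfo normalizes into the same dict shape produced by
# getSMRTCellInfoFromFullPath, with 'Context' fixed to 'none':
#
#   {'SMRTCellPath': '/mnt/data/runs/2770333', 'PrimaryFolder': '0007',
#    'Context': 'none', 'LIMSCode': <mapped code or None>}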
def createSecondaryJobs(self):
    # NOTE: the signature and the two assignments below are inferred from
    # validateDefinition(), which reads the parsed CSV from self.csv and
    # derives the job type the same way.
    csv = self.csv
    csvType = self.getDefinitionType()
    secondaryJobObjects = {}
    newJobDefs = {}
    # Process the CSV one condition at a time
    conditions = n.unique(csv['Name'])
    for cond in conditions:
        condRows = csv[csv['Name'] == cond]
        if csvType == 'newJob':
            # This is a new secondary job: populate the necessary database tables
            uniqueJobs = n.unique(zip(condRows['SecondaryServerName'],
                                      condRows['SecondaryProtocol'],
                                      condRows['SecondaryReference']))
            for job in uniqueJobs:
                msg = 'Creating SecondaryJob for job info: %s' % str(job)
                MU.logMsg(classString, msg, 'info')
                # First make the job, but don't save it
                serverName, protocol, reference = tuple(job)
                secondaryServer = SecondaryAnalysisServer.objects.get(serverName=serverName)
                sjs = SecondaryJobServiceFactory.create(secondaryServer)
                jobDef = {'protocol': protocol,
                          'reference': reference,
                          'server': secondaryServer}
                # Now add the cells belonging to this (server, protocol, reference) triple
                jobRows = condRows[(condRows['SecondaryServerName'] == serverName) &
                                   (condRows['SecondaryProtocol'] == protocol) &
                                   (condRows['SecondaryReference'] == reference)]
                jobCells = n.unique(zip(jobRows['SMRTCellPath'], jobRows['PrimaryFolder']))
                smrtCells = []
                for c in jobCells:
                    path, primaryFolder = tuple(c)
                    msg = 'Creating or accessing SMRTCell for data path: %s' % os.path.join(path, primaryFolder)
                    MU.logMsg(classString, msg, 'info')
                    if os.path.exists(path):
                        # This is a data path; look up its LIMS code
                        limsCode = LIMSMapper.limsCodeFromCellPath(path)
                        cell, _ = SMRTCell.objects.get_or_create(
                            path=path, primaryFolder=primaryFolder, limsCode=limsCode)
                    else:
                        # This is a LIMS code; resolve it to a data path
                        dataPath = LIMSMapper.cellPathFromLimsCode(path)
                        cell, _ = SMRTCell.objects.get_or_create(
                            path=dataPath, primaryFolder=primaryFolder, limsCode=path)
                    smrtCells.append(cell)
                # Add the SMRT Cells
                jobDef['cells'] = smrtCells
                # Reuse an identical job definition if one was already created
                hasJob = False
                for pk, jd in newJobDefs.iteritems():
                    if jobDef == jd:
                        hasJob = True
                        jobObj = SecondaryJob.objects.get(id=pk)
                        break
                if not hasJob:
                    cells = jobDef.pop('cells')
                    jobObj = SecondaryJob(**jobDef)
                    jobObj.save()
                    jobObj.cells.add(*cells)
                    jobDef['cells'] = cells
                    newJobDefs[jobObj.id] = jobDef
                    msg = 'Successfully created and saved SecondaryJob: %s' % str(model_to_dict(jobObj))
                    MU.logMsg(classString, msg, 'info')
                # Link the secondary job to its condition
                secondaryJobObjects.setdefault(cond, []).append(jobObj)
        else:
            # Job already exists on the secondary server
            for job, serverName in zip(condRows['SecondaryJobID'], condRows['SecondaryServerName']):
                server = SecondaryAnalysisServer.objects.get(serverName=serverName)
                newJob, created = SecondaryJob.objects.get_or_create(jobID=job, server=server)
                # Refresh the job info unconditionally (not only when the row
                # was newly created), so stale records are updated as well
                sjs = SecondaryJobServiceFactory.create(server)
                jobInfo = sjs.getModelJobInfo(newJob.jobID)
                # Add protocol and reference info
                newJob.protocol = jobInfo.get('protocol', 'unknown')
                newJob.reference = jobInfo.get('reference', 'unknown')
                newJob.save()
                # Get the SMRT Cells and attach them to the job
                smrtCellObjs = []
                for c in jobInfo.get('inputs', []):
                    cell, _ = SMRTCell.objects.get_or_create(
                        path=c['SMRTCellPath'],
                        primaryFolder=c['PrimaryFolder'],
                        limsCode=c['LIMSCode'])
                    smrtCellObjs.append(cell)
                newJob.cells.add(*smrtCellObjs)
                # Link the secondary job to its condition
                secondaryJobObjects.setdefault(cond, []).append(newJob)
    return secondaryJobObjects
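# A minimal driver sketch (assumes createSecondaryJobs lives on the same
# definition object as validateDefinition below; the variable name expDef
# and the condition names are hypothetical):
#
#   valid, msg = expDef.validateDefinition()
#   if not valid:
#       raise ValueError(msg)
#   jobsByCondition = expDef.createSecondaryJobs()
#   # e.g. {'wildtype': [<SecondaryJob: 12>], 'mutant': [<SecondaryJob: 13>]}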
def validateDefinition(self):
    csv = self.csv
    if csv is None:
        msg = '[%s] is not a valid file name or stream' % str(self.csv)
        return (False, msg)
    allHeaders = self.getValidHeaderValues()
    minHeaders = self.getValidHeaderValues(True)
    csvHeaders = filter(lambda x: not x.startswith('p_'), csv.dtype.names)

    # Check that valid headers were supplied
    jobType = self.getDefinitionType()
    if not jobType:
        msg = 'Invalid headers supplied in input csv: %s' % str(csvHeaders)
        return (False, msg)

    # Check that the CSV could actually be parsed
    if not csv.shape:
        msg = 'Invalid CSV file name or stream: file does not exist or stream can not be opened'
        return (False, msg)

    # Check for unpopulated default columns (empty columns parse as bool)
    minColnames = minHeaders.get(jobType)
    wrongColumns = filter(lambda x: n.dtype(x[1]) == n.dtype(bool) and x[0] in minColnames,
                          csv.dtype.descr)
    if wrongColumns:
        msg = ('Incorrectly formatted CSV file:\n Column(s) [%s] have not been populated'
               % ', '.join([c[0] for c in wrongColumns]))
        return (False, msg)

    # Check that the file contains the required default column names
    if filter(lambda x: x not in csvHeaders, minColnames):
        msg = 'Incorrectly formatted CSV file:\n Missing default column names from %s' % str(minColnames)
        return (False, msg)

    # Check that every SecondaryServerName maps to a valid server
    try:
        secondaryServers = [SecondaryAnalysisServer.objects.get(serverName=x)
                            for x in csv['SecondaryServerName']]
        secondaryServers = dict((x.serverName, x) for x in secondaryServers)
        serverNames = n.unique(secondaryServers.keys())
        dataHandlerDict = dict([(s, SecondaryJobServiceFactory.create(secondaryServers.get(s)))
                                for s in serverNames])
    except ObjectDoesNotExist:
        msg = ('Invalid SecondaryServerName. Valid values are: %s'
               % ', '.join([x.serverName for x in SecondaryAnalysisServer.objects.all()]))
        return (False, msg)

    # Check for correct naming of conditions
    if filter(lambda x: re.findall(r'[^A-Za-z0-9_\.\-]', str(x)), csv['Name']):
        msg = ('Incorrectly formatted CSV file:\n Condition names can only contain: '
               'alphanumeric characters, dashes (-), underscores (_) and dots (.)')
        return (False, msg)

    # Check that the non-default columns have a p_ prefix
    allColnames = allHeaders.get(jobType)
    extras = filter(lambda x: x not in allColnames, csv.dtype.names)
    if extras and not filter(lambda x: x.startswith('p_'), extras):
        msg = 'Incorrectly formatted CSV file:\n Extra parameters need to be named using a "p_" prefix'
        return (False, msg)

    if jobType == 'newJob':
        # Check new-job-specific settings
        serverPropDict = dict([(s, dataHandlerDict[s].makePropertyDict(('ReferenceNames', 'ProtocolNames')))
                               for s in serverNames])

        # Check that the supplied protocols and references exist on the secondary server
        for s, p, r in zip(csv['SecondaryServerName'], csv['SecondaryProtocol'], csv['SecondaryReference']):
            serverProtocols = serverPropDict[s].get('ProtocolNames', [])
            serverReferences = serverPropDict[s].get('ReferenceNames', [])
            referenceWhitelist = ['LIMSTemplates', 'LIMSTemplate']
            if len(filter(lambda x: x == p, serverProtocols)) != 1:
                msg = 'SecondaryProtocol does not map to a single name on server: %s' % p
                return (False, msg)
            if len(filter(lambda x: x == r, serverReferences)) != 1 and r not in referenceWhitelist:
                msg = 'SecondaryReference does not map to a single name on server: %s' % r
                return (False, msg)

        # Check that primary folder names are contained within the given run
        # codes and that pls.h5/bas.h5 files exist
        for s, d, p in zip(csv['SecondaryServerName'], csv['SMRTCellPath'], csv['PrimaryFolder']):
            if os.path.exists(d):
                # A data path was given, not a LIMS code
                scv = SMRTCellDataPathValidator(d, p)
                valid, msg = scv.isValid()
                if not valid:
                    return (False, msg)
                # Internal PacBio martin server validation
                elif re.findall('^martin', s.lower()):
                    limsCode = LIMSMapper.limsCodeFromCellPath(d)
                    if not limsCode:
                        return (False, 'Martin jobs must be given cells with a valid LIMS Code: supplied [%s]' % d)
            else:
                # The path provided might be a LIMS code - resolve it to a
                # data path (as createSecondaryJobs does) and validate that
                dataPath = LIMSMapper.cellPathFromLimsCode(d)
                if not dataPath:
                    return (False, 'SMRTCellPath [%s] is not a valid path, nor a valid LIMS Code' % d)
                scv = SMRTCellDataPathValidator(dataPath, p)
                valid, msg = scv.isValid()
                if not valid:
                    return (False, msg)

        # Check that the protocol is split/merge-able
        if 'ExtractBy' in csvHeaders:
            for s, p, e in zip(csv['SecondaryServerName'], csv['SecondaryProtocol'], csv['ExtractBy']):
                sjs = dataHandlerDict.get(s)
                if e and not sjs.protocolIsSplittable(p):
                    msg = ('SecondaryProtocol [%s] is not extractable by [%s]. '
                           'Does the protocol generate a cmp.h5?' % (p, e))
                    return (False, msg)

    elif jobType == 'existingJob':
        # Check that all job IDs are integers
        try:
            [int(x) for x in csv['SecondaryJobID']]
        except ValueError:
            msg = 'Invalid SecondaryJobID provided. All IDs must be integers: %s' % csv['SecondaryJobID']
            return (False, msg)

        # Check that the jobs actually exist on the specified servers
        for s, j in zip(csv['SecondaryServerName'], csv['SecondaryJobID']):
            sdh = dataHandlerDict.get(s)
            try:
                sdh.singleJobExists(j)
            except SecondaryJobServiceError:
                msg = 'Single Job ID [%s] does not exist on server [%s]' % (j, s)
                return (False, msg)

        # Check that the protocol is split/merge-able
        if 'ExtractBy' in csvHeaders:
            for s, j, e in zip(csv['SecondaryServerName'], csv['SecondaryJobID'], csv['ExtractBy']):
                sjs = dataHandlerDict.get(s)
                jobInfo = sjs.getModelJobInfo(j)
                protocol = jobInfo.get('protocol')
                if protocol and protocol != 'unknown' and e and not sjs.protocolIsSplittable(protocol):
                    msg = ('SecondaryProtocol [%s] is not extractable by [%s]. '
                           'Does the protocol generate a cmp.h5?' % (protocol, e))
                    return (False, msg)

    # COMMON TO NEW AND EXISTING JOBS
    # Check for uniqueness of column values within conditions
    for cond in n.unique(csv['Name']):
        condRows = csv[csv['Name'] == cond]
        notUnique = ['SecondaryJobID', 'SecondaryServerName', 'SMRTCellPath',
                     'PrimaryFolder', 'SecondaryProtocol']
        if filter(lambda x: len(n.unique(condRows[x])) != 1,
                  [k for k in condRows.dtype.names if k not in notUnique and not k.startswith('p_')]):
            msg = 'For condition name=%s some of the attributes are NOT unique' % cond
            return (False, msg)

        # Check that merged existing jobs all use the same reference
        if 'SecondaryJobID' in condRows.dtype.names:
            references = []
            for s, j in zip(condRows['SecondaryServerName'], condRows['SecondaryJobID']):
                sjs = dataHandlerDict.get(s)
                jobInfo = sjs.getModelJobInfo(j)
                refName = jobInfo.get('reference')
                if refName not in references:
                    references.append(refName)
            if len(references) > 1:
                msg = 'Cannot merge two jobs with different reference sequences! Condition: %s' % cond
                return (False, msg)

    # Check that split/merge (ExtractBy) expressions are well formed
    if 'ExtractBy' in csvHeaders:
        for e in csv['ExtractBy']:
            bOps = ['>', '<', '==', '!=', '!==', '>=', '<=']
            combs = ['&', '|']
            opsUsed = filter(lambda x: x in e, bOps)
            if not opsUsed:
                msg = ('Illegal syntax for ExtractBy field, please use a binary operator '
                       'statement (e.g. Readlength > 1000)')
                return (False, msg)
            combsUsed = filter(lambda x: x in e, combs)
            if combsUsed:
                clauses = [x.strip() for x in re.split(r'[&\|]+', e)]
            else:
                clauses = [e]
            validExtract = ExperimentDefinitionValidator.getValidExtractByValues()
            if not all([any([v in x for v in validExtract]) for x in clauses]):
                msg = ('Invalid ExtractBy [%s]. Please select a valid ExtractBy option: %s'
                       % (e, ', '.join(validExtract)))
                return (False, msg)

    return (True, 'CSV file passed validation')
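# Examples of ExtractBy values that would pass the syntax checks above
# (the metric names are illustrative; the authoritative list comes from
# ExperimentDefinitionValidator.getValidExtractByValues()):
#
#   'Readlength > 1000'                    # single binary-operator clause
#   'Readlength > 1000 & Accuracy >= 0.85' # clauses joined with & or |
#
# Each clause must use exactly one operator from bOps and mention a valid
# ExtractBy metric, or validateDefinition returns (False, msg).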