Beispiel #1
0
 def submitSecondaryJobs(self, resubmit=False):
     """Submit (or resubmit) every distinct secondary job of this object.

     ``self.secondaryJobs`` maps to collections of jobs; the same job may
     appear under several keys, so the jobs are de-duplicated before any
     service call is made.

     Args:
         resubmit: When true, jobs that already carry a ``jobID`` are
             resubmitted instead of being submitted as new jobs.
     """
     pending = set(chain.from_iterable(self.secondaryJobs.values()))
     for secondaryJob in pending:
         service = SecondaryJobServiceFactory.create(secondaryJob.server)
         if not (resubmit and secondaryJob.jobID):
             # Fresh submission: either no resubmit was requested or the
             # job has no job ID yet.
             service.submitSingleJob(secondaryJob, self.project.projectID)
             continue
         service.resubmitSingleJob(secondaryJob)
Beispiel #2
0
        def createSecondaryJobs():
            """Create or look up the SecondaryJob objects for every condition.

            Reads ``csv``, ``csvType`` and ``classString`` from the enclosing
            scope.  For ``csvType == "newJob"`` one SecondaryJob is created per
            distinct (server, protocol, reference, cells) definition; an
            identical definition seen again under another condition reuses the
            job created earlier in this call.  For any other ``csvType`` the job
            is fetched (or created) by job ID and server, and its protocol,
            reference and cells are refreshed from the secondary server.

            Returns:
                dict: condition name -> list of SecondaryJob objects linked to
                that condition.
            """
            secondaryJobObjects = {}

            # Definitions of the jobs created so far, keyed by SecondaryJob id.
            newJobDefs = {}

            for cond in n.unique(csv["Name"]):
                condRows = csv[csv["Name"] == cond]

                if csvType == "newJob":
                    # Row-wise de-duplication.  n.unique() flattens a list of
                    # tuples on current numpy, which would hand back single
                    # strings instead of (server, protocol, reference) rows.
                    uniqueJobs = sorted(
                        set(
                            zip(
                                condRows["SecondaryServerName"],
                                condRows["SecondaryProtocol"],
                                condRows["SecondaryReference"],
                            )
                        )
                    )

                    for job in uniqueJobs:
                        msg = "Creating SecondaryJob for job info: %s" % str(job)
                        MU.logMsg(classString, msg, "info")

                        serverName, protocol, reference = job

                        secondaryServer = SecondaryAnalysisServer.objects.get(serverName=serverName)
                        # NOTE(review): the returned service is not used here;
                        # the call is kept in case creating it has side effects
                        # (not visible from this block) - confirm and remove.
                        SecondaryJobServiceFactory.create(secondaryServer)

                        jobDef = {"protocol": protocol, "reference": reference, "server": secondaryServer}

                        # Collect the distinct (path, primary folder) pairs of this job
                        jobRows = condRows[
                            (condRows["SecondaryServerName"] == serverName)
                            & (condRows["SecondaryProtocol"] == protocol)
                            & (condRows["SecondaryReference"] == reference)
                        ]
                        jobCells = sorted(set(zip(jobRows["SMRTCellPath"], jobRows["PrimaryFolder"])))

                        smrtCells = []
                        for path, primaryFolder in jobCells:
                            msg = "Creating or accessing SMRTCell for data path: %s" % os.path.join(path, primaryFolder)
                            MU.logMsg(classString, msg, "info")
                            if os.path.exists(path):
                                # The column holds a data path
                                dataPath = path
                                limsCode = LIMSMapper.limsCodeFromCellPath(path)
                            else:
                                # The column holds a LIMS code
                                dataPath = LIMSMapper.cellPathFromLimsCode(path)
                                limsCode = path
                            cell, _ = SMRTCell.objects.get_or_create(
                                path=dataPath, primaryFolder=primaryFolder, limsCode=limsCode
                            )
                            smrtCells.append(cell)

                        jobDef["cells"] = smrtCells

                        # Reuse a job created earlier in this call if its
                        # definition (including the cells) is identical.
                        jobObj = None
                        for pk, jd in newJobDefs.items():
                            if jobDef == jd:
                                jobObj = SecondaryJob.objects.get(id=pk)
                                break

                        if jobObj is None:
                            # The cells are attached after saving, so they are
                            # not passed to the constructor.
                            jobObj = SecondaryJob(protocol=protocol, reference=reference, server=secondaryServer)
                            jobObj.save()
                            jobObj.cells.add(*smrtCells)
                            newJobDefs[jobObj.id] = jobDef

                        msg = "Successfully created and saved SecondaryJob: %s" % str(model_to_dict(jobObj))
                        MU.logMsg(classString, msg, "info")

                        # Link secondary job to condition
                        secondaryJobObjects.setdefault(cond, []).append(jobObj)

                else:
                    # Job already exists on the secondary server
                    for job, serverName in zip(condRows["SecondaryJobID"], condRows["SecondaryServerName"]):
                        server = SecondaryAnalysisServer.objects.get(serverName=serverName)
                        newJob, _created = SecondaryJob.objects.get_or_create(jobID=job, server=server)

                        # The job info is refreshed on every call, not only when
                        # the row was newly created (the `created` guard was
                        # deliberately disabled in the original code).
                        sjs = SecondaryJobServiceFactory.create(server)
                        jobInfo = sjs.getModelJobInfo(newJob.jobID)

                        newJob.protocol = jobInfo.get("protocol", "unknown")
                        newJob.reference = jobInfo.get("reference", "unknown")
                        newJob.save()

                        # Get the SMRT Cells reported as inputs of the job
                        smrtCellObjs = []
                        for c in jobInfo.get("inputs", []):
                            cell, _ = SMRTCell.objects.get_or_create(
                                path=c["SMRTCellPath"], primaryFolder=c["PrimaryFolder"], limsCode=c["LIMSCode"]
                            )
                            smrtCellObjs.append(cell)

                        # Attach all cells in one call, as the new-job branch does
                        newJob.cells.add(*smrtCellObjs)

                        # Link secondary job to condition
                        secondaryJobObjects.setdefault(cond, []).append(newJob)

            return secondaryJobObjects
Beispiel #3
0
    def validateDefinition(self):
        """Validate the experiment-definition CSV held in ``self.csv``.

        The checks run in this order and stop at the first failure:
        csv present, recognised header set, parsable csv, populated and
        complete default columns, known secondary servers, legal condition
        names, ``p_`` prefix on extra columns, job-type specific checks
        (``newJob`` / ``existingJob``), per-condition uniqueness of
        attributes, a single reference per merged condition, and the syntax
        of the ``ExtractBy`` column.

        Returns:
            tuple: ``(True, 'CSV file passed validation')`` on success,
            otherwise ``(False, message)`` describing the first failed check.
        """
        csv = self.csv

        # This guard must come before any attribute access on csv; otherwise
        # a missing csv raises AttributeError instead of reporting the error.
        if csv is None:
            msg = '[%s] is not a valid file name or stream' % str(self.csv)
            return (False, msg)

        allHeaders = self.getValidHeaderValues()
        minHeaders = self.getValidHeaderValues(True)
        # Kept as a tuple so the header list renders in messages as before.
        csvHeaders = tuple(h for h in csv.dtype.names if not h.startswith('p_'))

        # Check to see that valid headers were supplied
        jobType = self.getDefinitionType()
        if not jobType:
            msg = 'Invalid headers supplied in input csv: %s' % str(csvHeaders)
            return (False, msg)

        # Check if csvFN can be parsed
        if not csv.shape:
            msg = 'Invalid CSV file name or stream: file does not exist or stream can not be opened'
            return (False, msg)

        # Check for unpopulated default columns: a required column whose
        # dtype is bool is reported as not populated.
        minColnames = minHeaders.get(jobType)
        wrngclmns = [d for d in csv.dtype.descr
                     if n.dtype(d[1]) == n.dtype(bool) and d[0] in minColnames]
        if wrngclmns:
            msg = 'Incorrectly formatted CSV file:\n Column(s) [%s] have not been populated' % ', '.join([c[0] for c in wrngclmns])
            return (False, msg)

        # Check if the file contains the correct default column names
        if any(c not in csvHeaders for c in minColnames):
            msg = 'Incorrectly formatted CSV file:\n Missing default column names from %s' % str(minColnames)
            return (False, msg)

        # Check to ensure SecondaryServerName maps to valid server
        try:
            # One lookup per distinct server name instead of one per row
            secondaryServers = {}
            for name in n.unique(csv['SecondaryServerName']):
                serverObj = SecondaryAnalysisServer.objects.get(serverName=name)
                secondaryServers[serverObj.serverName] = serverObj
            serverNames = sorted(secondaryServers)
            dataHandlerDict = dict(
                (s, SecondaryJobServiceFactory.create(secondaryServers[s])) for s in serverNames)
        except ObjectDoesNotExist:
            msg = 'Invalid SecondaryServerName. Valid values are: %s' % (', '.join([x.serverName for x in SecondaryAnalysisServer.objects.all()]))
            return (False, msg)

        # Check for correct naming of conditions
        if any(re.search(r'[^A-Za-z0-9_\.\-]', str(x)) for x in csv['Name']):
            msg = 'Incorrectly formatted CSV file:\n Condition names can only contain: alphanumeric characters, dashes (-), underscores (_) and dots (.)'
            return (False, msg)

        # Check if the non-default columns have a p_ prefix.  Every extra
        # column must carry the prefix; previously the check only failed when
        # none of the extra columns had it.
        allColnames = allHeaders.get(jobType)
        extras = [c for c in csv.dtype.names if c not in allColnames]
        if any(not c.startswith('p_') for c in extras):
            msg = 'Incorrectly formatted CSV file:\n Extra parameters need to be named using a "p_" prefix'
            return (False, msg)

        # Check new job specific settings
        if jobType == 'newJob':
            serverPropDict = dict(
                (s, dataHandlerDict[s].makePropertyDict(('ReferenceNames', 'ProtocolNames')))
                for s in serverNames)
            referenceWhitelist = ['LIMSTemplates', 'LIMSTemplate']

            # Check if protocols and references provided exists in secondary server's protocol list
            for s, p, r in zip(csv['SecondaryServerName'], csv['SecondaryProtocol'], csv['SecondaryReference']):
                serverProtocols = serverPropDict[s].get('ProtocolNames', [])
                serverReferences = serverPropDict[s].get('ReferenceNames', [])
                if sum(1 for x in serverProtocols if x == p) != 1:
                    msg = 'SecondaryProtocol does not map to a single name on server: %s' % p
                    return (False, msg)
                if sum(1 for x in serverReferences if x == r) != 1 and r not in referenceWhitelist:
                    msg = 'SecondaryReference does not map to a single name on server: %s' % r
                    return (False, msg)

            # Check whether primary folder names are contained within the given run codes and pls.h5/bas.h5 files exist
            for s, d, p in zip(csv['SecondaryServerName'], csv['SMRTCellPath'], csv['PrimaryFolder']):
                if os.path.exists(d):
                    # A data path was given, not a LIMSCode
                    scv = SMRTCellDataPathValidator(d, p)
                    valid, msg = scv.isValid()
                    if not valid:
                        return (False, msg)
                    # This is for internal PacBio martin server validation
                    if re.findall('^martin', s.lower()):
                        limsPath = LIMSMapper.limsCodeFromCellPath(d)
                        if not limsPath:
                            return (False, 'Martin jobs must be given cells with a valid LIMS Code: supplied [%s]' % d)
                else:
                    # Path provided might be a LIMSCode - check with the LIMSHandler to see if it validates
                    # NOTE(review): d is treated as a LIMS code here, yet it is passed to
                    # limsCodeFromCellPath and the result is used as a path - confirm that
                    # cellPathFromLimsCode was not intended.
                    limsPath = LIMSMapper.limsCodeFromCellPath(d)
                    if not limsPath:
                        return (False, 'SMRTCellPath [%s] is not a valid path, nor a valid LIMS Code' % d)

                    scv = SMRTCellDataPathValidator(limsPath, p)
                    valid, msg = scv.isValid()
                    if not valid:
                        return (False, msg)

            # Check to make sure that protocol is split/merge-able.
            if 'ExtractBy' in csvHeaders:
                for s, p, e in zip(csv['SecondaryServerName'], csv['SecondaryProtocol'], csv['ExtractBy']):
                    sjs = dataHandlerDict.get(s)
                    if e and not sjs.protocolIsSplittable(p):
                        msg = 'SecondaryProtocol [%s] is not be extractable by [%s].  Does the protocol generate a cmp.h5?' % (p, e)
                        return (False, msg)

        elif jobType == 'existingJob':

            # Check to make sure all job IDs are integers
            try:
                for x in csv['SecondaryJobID']:
                    int(x)
            except ValueError:
                msg = 'Invalid SecondaryJobID provided.  All IDs must be integers: %s' % csv['SecondaryJobID']
                return (False, msg)

            # Check to make sure that jobs actually exist on specified servers.
            # Only the exception is used; the return value is ignored.
            for s, j in zip(csv['SecondaryServerName'], csv['SecondaryJobID']):
                sdh = dataHandlerDict.get(s)
                try:
                    sdh.singleJobExists(j)
                except SecondaryJobServiceError:
                    msg = 'Single Job ID [%s] does not exist on server [%s]' % (j, s)
                    return (False, msg)

            # Check to make sure that protocol is split/merge-able.
            if 'ExtractBy' in csvHeaders:
                for s, j, e in zip(csv['SecondaryServerName'], csv['SecondaryJobID'], csv['ExtractBy']):
                    sjs = dataHandlerDict.get(s)
                    jobInfo = sjs.getModelJobInfo(j)
                    protocol = jobInfo.get('protocol')
                    if protocol and protocol != 'unknown' and e and not sjs.protocolIsSplittable(protocol):
                        msg = 'SecondaryProtocol [%s] is not be extractable by [%s].  Does the protocol generate a cmp.h5?' % (protocol, e)
                        return (False, msg)

        # COMMON TO NEW AND EXISTING JOBS
        # Columns that are allowed to vary between the rows of one condition
        notUnique = ['SecondaryJobID', 'SecondaryServerName', 'SMRTCellPath', 'PrimaryFolder', 'SecondaryProtocol']
        for cond in n.unique(csv['Name']):
            condRows = csv[csv['Name'] == cond]

            # Check for uniqueness of column values within conditions
            mustBeUnique = [k for k in condRows.dtype.names
                            if k not in notUnique and not k.startswith('p_')]
            if any(len(n.unique(condRows[k])) != 1 for k in mustBeUnique):
                msg = 'For condition name=%s some of the attributes are NOT unique' % cond
                return (False, msg)

            # Check to make sure that merged existing jobs all have the same reference
            if 'SecondaryJobID' in condRows.dtype.names:
                references = []
                for s, j in zip(condRows['SecondaryServerName'], condRows['SecondaryJobID']):
                    sjs = dataHandlerDict.get(s)
                    jobInfo = sjs.getModelJobInfo(j)
                    refName = jobInfo.get('reference')
                    if refName not in references:
                        references.append(refName)

                if len(references) > 1:
                    msg = 'Cannot merge two jobs with different reference sequences! Condition: %s' % condRows['Name']
                    return (False, msg)

        # Check to make sure that split/merge was specified correctly
        if 'ExtractBy' in csvHeaders:
            bOps = ['>', '<', '==', '!=', '!==', '>=', '<=']
            combs = ['&', '|']
            for e in csv['ExtractBy']:
                if not any(op in e for op in bOps):
                    msg = 'Illegal syntax for ExtractBy field, please use binary operator statement (e.g. Readlength > 1000)'
                    return (False, msg)
                # Split combined statements into their individual clauses
                if any(c in e for c in combs):
                    clauses = [x.strip() for x in re.split(r'[&\|]+', e)]
                else:
                    clauses = [e]
                validExtract = ExperimentDefinitionValidator.getValidExtractByValues()
                # Every clause must mention at least one valid ExtractBy value
                if not all(any(v in x for v in validExtract) for x in clauses):
                    msg = 'Invalid ExtractBy [%s]. Please select valid ExtractBy option: %s' % (clauses, ', '.join(validExtract))
                    return (False, msg)

        return (True, 'CSV file passed validation')
    req = handler.getModelJobInfo(jobID)
    PO('Basic Job Info', req)

if __name__ == '__main__':
    # Manual smoke test against a secondary analysis server.  Swap in the
    # commented-out server/protocol/jobID values or re-enable individual calls
    # below to exercise other parts of the service.

    # Parenthesised single-argument form prints the same on Python 2 and is
    # also valid on Python 3.
    print("Beginning Tests...")

    server = PBU.MARTIN_DVLOCAL_SMRT_SERVER
    jobID = '185018'
    protocol = 'Standard_Standard'

#    server = PBU.MP17_DVLOCAL_SMRT_SERVER
#    protocol = 'RS_Resequencing.1'
#    jobID = '055861'

    handler = SecondaryJobServiceFactory.create(server, disk=True)
    #pingSecondaryServer(server)
    getModelJobInfo(server, handler, jobID)

#    getReferenceSequences(server, handler)
#    getReferenceEntries(server, handler)
#    getReferenceNames(server, handler)
#    getSingleReferenceEntry(server, handler, 'Cholera_2010EL_1786')
#
#    getProtocols(server, handler)
#    getProtocolEntries(server, handler)
#    getProtocolNames(server, handler)
#    getSingleProtocolEntry(server, handler, 'NoRL_ForwardOnly')
    testProtocolFunction(server, handler, protocol)
#