def findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=False):
    '''Looks for all result files in the results folder.

    Walks the steps in STEP_ORDER['pe'] (when pairedEnd is true) or
    STEP_ORDER['se'] and, for each file token in STEPS[step]['results'],
    looks up resultsFolder + <result pattern> in projectId with
    dxencode.find_file (non-recursive).

    When maplambda is true and a result belonging to a step whose name
    contains 'trim' is not found, the lookup is retried in resultsFolder
    with a trailing '/lambda' path component removed.

    Returns a dict of file token -> value returned by dxencode.find_file.
    Tokens whose lookup yields None are left out.
    '''
    lambdaSuffix = '/lambda'
    priors = {}
    steps = STEP_ORDER['pe'] if pairedEnd else STEP_ORDER['se']
    for step in steps:
        for fileToken, pattern in STEPS[step]['results'].items():
            fid = dxencode.find_file(resultsFolder + pattern,
                                     project=projectId, recurse=False)
            if fid is None and maplambda and 'trim' in step:
                ## giant kludge
                # str.rstrip('/lambda') treats its argument as a SET of
                # characters and would also eat the tail of the parent folder
                # name (e.g. '/runs/data/lambda' -> '/runs/dat'), so remove
                # the suffix explicitly instead.
                folder = resultsFolder.rstrip('/')
                if folder.endswith(lambdaSuffix):
                    folder = folder[:-len(lambdaSuffix)].rstrip('/')
                fid = dxencode.find_file(folder + pattern,
                                         project=projectId, recurse=False)
            if fid is not None:
                priors[fileToken] = fid
    return priors
def checkRunsPreviouslyLaunched(resultsFolder, projectId):
    '''Checks for currently running jobs and will exit if found.

    Reads every file matching resultsFolder/RUNS_LAUNCHED_FILE in projectId.
    Each line is expected to start with an analysis id ('analysis-...'),
    optionally followed by free text; other lines (including blank ones) are
    skipped.  The state of each analysis is fetched via
    dxpy.DXAnalysis(...).describe() and reported on stdout.

    Calls sys.exit(1) as soon as an analysis is found whose state is not
    one of 'done', 'failed' or 'terminated'.  Returns None otherwise.
    '''
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    launchFids = dxencode.find_file(launchFilePath, projectId, multiple=True)
    if launchFids is None:
        print(" No prior jobs launched.")
        return

    # states I have seen: in_progress, terminated, done, failed
    finishedStates = ("done", "failed", "terminated")

    # NOTE: Appending to the one file, but just in case handle multiple files.
    for fid in launchFids:
        with dxpy.open_dxfile(fid) as fd:
            for line in fd:
                runId = line.split(None, 1)
                # A blank line splits to [], so check for that before
                # indexing to avoid an IndexError.
                if not runId or not runId[0].startswith('analysis-'):
                    continue
                analysis = dxpy.DXAnalysis(dxid=runId[0])
                if analysis is None:
                    continue
                state = analysis.describe()['state']

                # Shared "[id] (rest of line) " fragment of both messages.
                label = "[" + runId[0] + "] "
                if len(runId) > 1:
                    label += "(" + runId[1] + ") "

                if state not in finishedStates:
                    print("Exiting: Can't launch because prior run " + label +
                          "has not finished (currently '" + state + "').")
                    sys.exit(1)
                print(" Prior run " + label + "is '" + state + "'.")
def checkRunsPreviouslyLaunched(resultsFolder, projectId):
    '''Checks for currently running jobs and will exit if found.

    Every file matching resultsFolder/RUNS_LAUNCHED_FILE in projectId is
    scanned line by line.  A line is only considered when its first
    whitespace-separated word starts with 'analysis-'; anything after that
    word is echoed in the status message.  Blank lines are ignored.

    For each such analysis the current state is taken from
    dxpy.DXAnalysis(...).describe()['state'] and printed.  If the state is
    not 'done', 'failed' or 'terminated' the process exits via sys.exit(1).
    '''
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    launchFids = dxencode.find_file(launchFilePath, projectId, multiple=True)
    if launchFids is None:
        print(" No prior jobs launched.")
        return

    # states I have seen: in_progress, terminated, done, failed
    terminalStates = ("done", "failed", "terminated")

    # NOTE: Appending to the one file, but just in case handle multiple files.
    for fid in launchFids:
        with dxpy.open_dxfile(fid) as fd:
            for line in fd:
                runId = line.split(None, 1)
                # An empty/whitespace-only line yields [], which used to
                # crash on runId[0]; skip it like any other non-analysis line.
                if not runId or not runId[0].startswith('analysis-'):
                    continue
                analysis = dxpy.DXAnalysis(dxid=runId[0])
                if analysis is None:
                    continue
                state = analysis.describe()['state']

                # "[id] " plus the optional "(remainder of the line) ".
                runLabel = "[" + runId[0] + "] "
                if len(runId) > 1:
                    runLabel += "(" + runId[1] + ") "

                if state not in terminalStates:
                    print("Exiting: Can't launch because prior run " +
                          runLabel + "has not finished (currently '" +
                          state + "').")
                    sys.exit(1)
                print(" Prior run " + runLabel + "is '" + state + "'.")
def parse_map_report(folder, project):
    '''Extracts labelled values from the Bismark mapping report in folder.

    The report is located with dxencode.find_file using the pattern
    folder + "/*_bismark_map_report.txt" in project.get_id()
    (non-recursive).  Every line that starts with "<label>:" followed by
    whitespace, for a label taken from the module-level `labels`, contributes
    one entry: matched label -> rest of the line, stripped.

    Any exception while opening or reading the report is reported on stdout
    and otherwise swallowed; whatever was parsed up to that point is kept.

    Returns the dict of collected metrics (possibly empty).  Earlier
    revisions built this dict but never returned it.
    '''
    mapreport = "/*_bismark_map_report.txt"
    report_link = dxencode.find_file(folder + mapreport, project.get_id(),
                                     recurse=False)
    metrics = {}
    # One anchored pattern per label; labels are interpolated as regex text.
    patterns = [re.compile(r"(%s):\s+(.+)" % lab) for lab in labels]
    try:
        with dxpy.open_dxfile(report_link) as rfd:
            for line in rfd:
                for pattern in patterns:
                    m = pattern.match(line)
                    if m:
                        metrics[m.group(1)] = m.group(2).strip()
    except Exception as e:
        # Best effort: a missing/unreadable report must not abort the caller.
        print("ERROR: Could not read Bismark mapping report in %s (%s) \n%s" % (folder, report_link, e))
    return metrics
def get_fastqc(accession, project):
    '''Reads the 'Total Sequences' count from an accession's FastQC output.

    Looks up <accession>_summary.txt and <accession>_data.txt in
    project.get_id() via dxencode.find_file, then scans the data file for
    a line starting with "Total Sequences" followed by digits.

    Returns a dict.  On success it holds 'Total Sequences' -> the digits as
    a string (absent if no line matched).  If opening or reading the data
    file raises, the error is printed and 'Total Sequences' is set to the
    sentinel -999.999.  Earlier revisions built this dict but never returned
    it.
    '''
    summary_fn = accession + "_summary.txt"
    report_fn = accession + "_data.txt"
    summary_link = dxencode.find_file(summary_fn, project.get_id())
    report_link = dxencode.find_file(report_fn, project.get_id())

    metrics = {}
    total = re.compile(r'Total Sequences\s+(\d+)')
    try:
        with dxpy.open_dxfile(report_link) as rfd:
            for line in rfd:
                m = total.match(line)
                if m:
                    metrics['Total Sequences'] = m.group(1)
    except Exception as e:
        # NOTE(review): the file being read is report_fn/report_link, yet the
        # message names the summary file -- confirm which one should be shown.
        print("ERROR: Could not read FastQC summary: %s (%s) \n%s" % (summary_fn, summary_link, e))
        metrics['Total Sequences'] = -999.999
    return metrics
def findReferenceFiles(refs, priors, refLoc, extras):
    '''Locates all reference files based upon gender, genome and annotation.

    For each name in refs, the file name is taken from
    GENOME_REFERENCES[ref][extras['genome']][extras['gender']] and looked up
    under refLoc/<genome>/ in REF_PROJECT_DEFAULT with dxencode.find_file.
    The result is stored in priors[ref] (priors is modified in place).

    Exits through sys.exit with an error message as soon as a lookup
    yields None.
    '''
    #TODO move to module? Have to determine dx file structure.
    genome = extras['genome']
    genomeFolder = refLoc + '/' + genome
    for ref in refs:
        refName = GENOME_REFERENCES[ref][genome][extras['gender']]
        dxfile = genomeFolder + '/' + refName
        fid = dxencode.find_file(dxfile, REF_PROJECT_DEFAULT)
        if fid is None:
            sys.exit("ERROR: Unable to locate DNA Methylation ref file: '" + dxfile + "'")
        priors[ref] = fid
def findPriorResults(pairedEnd, resultsFolder, projectId, maplambda=False):
    '''Looks for all result files in the results folder.

    The step list is STEP_ORDER['pe'] when pairedEnd is true, otherwise
    STEP_ORDER['se'].  For every file token declared in
    STEPS[step]['results'] the path resultsFolder + <result pattern> is
    looked up in projectId through dxencode.find_file (non-recursive).

    If maplambda is true, a result that is missing for a step whose name
    contains 'trim' is looked up a second time in resultsFolder with its
    trailing '/lambda' component dropped.

    Returns a dict keyed by file token holding whatever dxencode.find_file
    returned; tokens for which it returned None are omitted.
    '''
    lambdaSuffix = '/lambda'
    priors = {}
    steps = STEP_ORDER['pe'] if pairedEnd else STEP_ORDER['se']
    for step in steps:
        for fileToken, pattern in STEPS[step]['results'].items():
            fid = dxencode.find_file(resultsFolder + pattern,
                                     project=projectId, recurse=False)
            if fid is None and maplambda and 'trim' in step:
                ## giant kludge
                # The previous resultsFolder.rstrip('/lambda') removed any
                # trailing run of the characters '/', 'l', 'a', 'm', 'b', 'd',
                # which corrupts parent folders ending in those letters.
                # Drop the literal suffix instead.
                folder = resultsFolder.rstrip('/')
                if folder.endswith(lambdaSuffix):
                    folder = folder[:-len(lambdaSuffix)].rstrip('/')
                fid = dxencode.find_file(folder + pattern,
                                         project=projectId, recurse=False)
            if fid is not None:
                priors[fileToken] = fid
    return priors
def findReferenceFiles(refs, priors, refLoc, extras):
    '''Locates all reference files based upon gender, genome and annotation.

    Each entry of refs is resolved to a file name through
    GENOME_REFERENCES[ref][extras['genome']][extras['gender']], searched for
    below refLoc/<genome> in REF_PROJECT_DEFAULT using dxencode.find_file,
    and recorded in the caller's priors dict under the key ref.

    A lookup that yields None terminates the program via sys.exit with an
    error message naming the missing path.
    '''
    #TODO move to module? Have to determine dx file structure.
    genomeName = extras['genome']
    baseFolder = refLoc + '/' + genomeName
    for ref in refs:
        fileName = GENOME_REFERENCES[ref][genomeName][extras['gender']]
        dxfile = baseFolder + '/' + fileName
        fid = dxencode.find_file(dxfile, REF_PROJECT_DEFAULT)
        if fid is None:
            sys.exit("ERROR: Unable to locate DNA Methylation ref file: '" + dxfile + "'")
        priors[ref] = fid
def logThisRun(runId, resultsFolder, projectId):
    '''Adds a runId to the runsLaunched file in resultsFolder.

    A new RUNS_LAUNCHED_FILE is created in resultsFolder and receives
    "<runId> started:<current time>" as its first line.  If a launch file
    already exists, each of its lines is copied after the new entry and the
    old file is then removed from the project.  The new file is closed last.
    '''
    # NOTE: DX manual lies?! Append not possible?! Then write new/delete old
    previousFid = dxencode.find_file(resultsFolder + '/' + RUNS_LAUNCHED_FILE,
                                     projectId)
    logFh = dxpy.new_dxfile('a', project=projectId, folder=resultsFolder,
                            name=RUNS_LAUNCHED_FILE)

    startedAt = str(datetime.now())
    logFh.write(runId + ' started:' + startedAt + '\n')

    if previousFid is not None:
        # Carry the earlier entries over before dropping the old file.
        with dxpy.open_dxfile(previousFid) as previousFh:
            for entry in previousFh:
                logFh.write(entry + '\n')
        dxpy.DXProject(projectId).remove_objects([previousFid])

    logFh.close()
def logThisRun(runId, resultsFolder, projectId):
    '''Adds a runId to the runsLaunched file in resultsFolder.

    Writes a replacement RUNS_LAUNCHED_FILE whose first line is
    "<runId> started:<current time>", followed by the lines of the existing
    launch file when one is found.  The existing file is removed from the
    project once copied, and the replacement is closed at the end.
    '''
    # NOTE: DX manual lies?! Append not possible?! Then write new/delete old
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    existingFid = dxencode.find_file(launchFilePath, projectId)
    replacement = dxpy.new_dxfile('a', project=projectId,
                                  folder=resultsFolder,
                                  name=RUNS_LAUNCHED_FILE)

    timestamp = str(datetime.now())
    replacement.write(runId + ' started:' + timestamp + '\n')

    if existingFid is not None:
        # Older entries go below the newest one; then the old file is retired.
        with dxpy.open_dxfile(existingFid) as existing:
            for previousLine in existing:
                replacement.write(previousLine + '\n')
        dxpy.DXProject(projectId).remove_objects([existingFid])

    replacement.close()
def parse_map_report(folder, project):
    '''Extracts labelled values from the Bismark mapping report in folder.

    dxencode.find_file is asked for folder + "/*_bismark_map_report.txt"
    in project.get_id() (non-recursive) and the resulting file is read line
    by line.  A line beginning with "<label>:" plus whitespace, where the
    label comes from the module-level `labels`, adds an entry mapping the
    matched label to the stripped remainder of the line.

    Failures while opening or reading the report are printed and swallowed,
    keeping whatever had been parsed so far.

    Returns the dict of collected metrics (possibly empty).  Earlier
    revisions built this dict but never returned it.
    '''
    mapreport = "/*_bismark_map_report.txt"
    report_link = dxencode.find_file(folder + mapreport, project.get_id(),
                                     recurse=False)
    metrics = {}
    # One anchored pattern per label; labels are interpolated as regex text.
    labelPatterns = [re.compile(r"(%s):\s+(.+)" % lab) for lab in labels]
    try:
        with dxpy.open_dxfile(report_link) as rfd:
            for line in rfd:
                for labelPattern in labelPatterns:
                    m = labelPattern.match(line)
                    if m:
                        metrics[m.group(1)] = m.group(2).strip()
    except Exception as e:
        # Best effort: a missing/unreadable report must not abort the caller.
        print("ERROR: Could not read Bismark mapping report in %s (%s) \n%s" % (folder, report_link, e))
    return metrics