def checkRunsPreviouslyLaunched(resultsFolder,projectId): '''Checks for currently running jobs and will exit if found.''' launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE launchFids = dxencode.find_file(launchFilePath,projectId,multiple=True) if launchFids == None: print " No prior jobs launched." else: # NOTE: Appending to the one file, but just in case handle multiple files. for fid in launchFids: with dxpy.open_dxfile(fid) as fd: for line in fd: #print "Looking for job ["+line+"]" runId = line.split(None,1) if not runId[0].startswith('analysis-'): continue analysis = dxpy.DXAnalysis(dxid=runId[0]) if analysis == None: continue state = analysis.describe()['state'] # states I have seen: in_progress, terminated, done, failed if state not in [ "done", "failed", "terminated" ]: msg="Exiting: Can't launch because prior run ["+runId[0]+"] " if len(runId) > 1: msg+="("+runId[1]+") " msg+= "has not finished (currently '"+state+"')." print msg sys.exit(1) else: msg=" Prior run ["+runId[0]+"] " if len(runId) > 1: msg+="("+runId[1]+") " msg+= "is '"+state+"'." print msg
def checkRunsPreviouslyLaunched(resultsFolder, projectId): '''Checks for currently running jobs and will exit if found.''' launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE launchFids = dxencode.find_file(launchFilePath, projectId, multiple=True) if launchFids == None: print " No prior jobs launched." else: # NOTE: Appending to the one file, but just in case handle multiple files. for fid in launchFids: with dxpy.open_dxfile(fid) as fd: for line in fd: #print "Looking for job ["+line+"]" runId = line.split(None, 1) if not runId[0].startswith('analysis-'): continue analysis = dxpy.DXAnalysis(dxid=runId[0]) if analysis == None: continue state = analysis.describe()['state'] # states I have seen: in_progress, terminated, done, failed if state not in ["done", "failed", "terminated"]: msg = "Exiting: Can't launch because prior run [" + runId[ 0] + "] " if len(runId) > 1: msg += "(" + runId[1] + ") " msg += "has not finished (currently '" + state + "')." print msg sys.exit(1) else: msg = " Prior run [" + runId[0] + "] " if len(runId) > 1: msg += "(" + runId[1] + ") " msg += "is '" + state + "'." print msg
def parse_map_report(folder, project): mapreport = "/*_bismark_map_report.txt" report_link = dxencode.find_file(folder + mapreport, project.get_id(), recurse=False) metrics = {} res = {} for lab in labels: res[lab] = re.compile("(%s):\s+(.+)" % lab) try: with dxpy.open_dxfile(report_link) as rfd: for line in rfd: m = False for metric in res.values(): m = metric.match(line) if m: metrics.update({m.group(1): m.group(2).strip()}) continue except Exception, e: print "ERROR: Could not read Bismark mapping report in %s (%s) \n%s" % ( folder, report_link, e)
def test_file_context_manager(self):
    '''A file written via a context manager should end up closing/closed with the right size.'''
    with dxpy.new_dxfile(mode='w') as self.dxfile:
        file_id = self.dxfile.get_id()
        self.dxfile.write("Haha")
    reopened = dxpy.open_dxfile(file_id)
    self.assertTrue(reopened._get_state() in ['closing', 'closed'])
    reopened._wait_on_close()
    self.assertEqual(reopened.describe()["size"], 4)
def test_iter_dxfile(self):
    '''Iterating a closed dxfile yields its lines in order.'''
    with dxpy.new_dxfile() as self.dxfile:
        dxid = self.dxfile.get_id()
        self.dxfile.write("Line 1\nLine 2\nLine 3\n")
    with dxpy.open_dxfile(dxid) as same_dxfile:
        same_dxfile.wait_on_close()
        self.assertTrue(same_dxfile.closed())
        for lineno, line in enumerate(same_dxfile, 1):
            self.assertEqual(line, "Line " + str(lineno))
def logThisRun(runId,resultsFolder,projectId): '''Adds a runId to the runsLaunched file in resultsFolder.''' # NOTE: DX manual lies?! Append not possible?! Then write new/delete old launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE oldFid = dxencode.find_file(launchFilePath,projectId) newFh = dxpy.new_dxfile('a',project=projectId,folder=resultsFolder,name=RUNS_LAUNCHED_FILE) newFh.write(runId+' started:'+str(datetime.now())+'\n') if oldFid is not None: with dxpy.open_dxfile(oldFid) as oldFh: for oldRunId in oldFh: newFh.write(oldRunId+'\n') proj = dxpy.DXProject(projectId) proj.remove_objects([oldFid]) newFh.close()
def get_barcode_stats(self, barcode):
    """
    Loads the JSON in a ${barcode}_stats.json file in the DNAnexus project
    (usually in the qc folder).
    """
    filename = barcode + "_stats.json"
    # In the call to dxpy.find_one_data_object() below, I'd normally set the
    # more_ok parameter to False, but this blows-up in Python 3.7 - giving me a
    # RuntimeError. So, I just won't set it for now. I think dxpy is still
    # mainly a Python 2.7 library and can break in later version of Python3.
    dx_result = dxpy.find_one_data_object(zero_ok=False,
                                          project=self.dx_project_id,
                                          name=filename)
    return json.loads(dxpy.open_dxfile(dx_result["id"]).read())
def logThisRun(runId, resultsFolder, projectId):
    '''Adds a runId to the runsLaunched file in resultsFolder.'''
    # NOTE: DX manual lies?! Append not possible?! Then write new/delete old
    log_path = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    prior_fid = dxencode.find_file(log_path, projectId)
    log_fh = dxpy.new_dxfile('a', project=projectId, folder=resultsFolder,
                             name=RUNS_LAUNCHED_FILE)
    log_fh.write(runId + ' started:' + str(datetime.now()) + '\n')
    if prior_fid is not None:
        # Carry the previous entries over into the replacement file.
        with dxpy.open_dxfile(prior_fid) as prior_fh:
            for prior_entry in prior_fh:
                log_fh.write(prior_entry + '\n')
        dxpy.DXProject(projectId).remove_objects([prior_fid])
    log_fh.close()
def get_fastqc(accession, project): summary_fn = accession+"_summary.txt" report_fn = accession+"_data.txt" summary_link = dxencode.find_file(summary_fn, project.get_id()) report_link = dxencode.find_file(report_fn, project.get_id()) metrics = {} try: with dxpy.open_dxfile(report_link) as rfd: total = re.compile('Total Sequences\s+(\d+)') for line in rfd: m = total.match(line) if m: metrics.update({ 'Total Sequences': m.group(1) }) except Exception, e: print "ERROR: Could not read FastQC summary: %s (%s) \n%s" % (summary_fn, summary_link, e) metrics.update({'Total Sequences': -999.999 })
def get_run_details_json(self):
    """
    Retrieves the JSON object for the stats in the file named run_details.json
    in the project specified by self.dx_project_id.

    Returns:
        JSON object of the run details, or None when no run_details.json
        exists in the project.
    """
    run_details_filename = "run_details.json"
    run_details_json = dxpy.find_one_data_object(
        more_ok=False,
        zero_ok=True,
        project=self.dx_project_id,
        name=run_details_filename)
    # FIX: with zero_ok=True, find_one_data_object() returns None when the file
    # is absent; the previous code subscripted that None ("[...]['id']") and
    # crashed with an unhelpful TypeError.
    if run_details_json is None:
        return None
    json_data = json.loads(
        dxpy.open_dxfile(dxid=run_details_json["id"]).read())
    #dxpy.download_dxfile(show_progress=True,dxid=run_details_json_id,project=self.dx_project_id,filename=output_name)
    return json_data
def test_write_read_dxfile(self):
    '''Round-trip: data written to a new dxfile reads back identically, and seek works.'''
    with dxpy.new_dxfile() as self.dxfile:
        dxid = self.dxfile.get_id()
        self.dxfile.write(self.foo_str)
    with dxpy.open_dxfile(dxid) as readback:
        readback.wait_on_close()
        self.assertTrue(readback.closed())
        self.assertEqual(self.foo_str, readback.read(len(self.foo_str)))
        # At EOF a further read yields nothing.
        self.assertEqual(len(readback.read()), 0)
        readback.seek(1)
        self.assertEqual(self.foo_str[1:], readback.read())
def get_sample_stats_json(self, barcode=None):
    """
    .. deprecated:: 0.1.0
       GSSC has removed the sample_stats.json file since the entire folder it
       was in has been removed. Use :meth:`get_barcode_stats` instead.

    Retrieves the JSON object for the stats in the file named sample_stats.json
    in the project specified by self.dx_project_id. This file is located in the
    DNAnexus folder stage\d_qc_report.

    Args:
        barcode: `str`. The barcode for the sample. Currently, the
            sample_stats.json file is of the following form when there isn't a
            genome mapping: [{"Sample name": "AGTTCC"}, {"Sample name":
            "CAGATC"}, {"Sample name": "GCCAAT"}, ...}]. When there is a
            mapping, each dictionary has many more keys in addition to the
            "Sample name" one.

    Returns:
        `list` of dicts if barcode=None, otherwise a dict for the given barcode.

    Raises:
        DnanexusBarcodeNotFound: A barcode was given but no entry with that
            "Sample name" exists in sample_stats.json.
    """
    sample_stats_json_filename = "sample_stats.json"
    sample_stats_json_id = dxpy.find_one_data_object(
        more_ok=False,
        zero_ok=False,
        project=self.dx_project_id,
        name=sample_stats_json_filename)["id"]
    #dxpy.download_dxfile(dxid=sample_stats_json_id,project=self.dx_project_id,filename=sample_stats_json_filename)
    json_data = json.loads(dxpy.open_dxfile(sample_stats_json_id).read())
    if not barcode:
        return json_data
    for d in json_data:  # d is a dictionary
        if d["Sample name"] == barcode:
            return d
    # FIX: the former trailing 'if barcode:' guard was always true here (the
    # barcode-less path returned above), so raise unconditionally.
    raise DnanexusBarcodeNotFound(
        "Barcode {barcode} for {library_name} not found in {sample_stats_json_filename} in project {project}."
        .format(barcode=barcode,
                library_name=self.library_name,
                sample_stats_json_filename=sample_stats_json_filename,
                project=self.dx_project_id))
def parse_map_report(folder, project): mapreport = "/*_bismark_map_report.txt" report_link = dxencode.find_file(folder + mapreport, project.get_id(), recurse=False) metrics = {} res = {} for lab in labels: res[lab] = re.compile("(%s):\s+(.+)" % lab) try: with dxpy.open_dxfile(report_link) as rfd: for line in rfd: m = False for metric in res.values(): m = metric.match(line) if m: metrics.update({m.group(1): m.group(2).strip()}) continue except Exception, e: print "ERROR: Could not read Bismark mapping report in %s (%s) \n%s" % (folder, report_link, e)
def get_alignment_summary_metrics(self, barcode):
    """
    Parses the metrics in a ${barcode}alignment_summary_metrics file in the
    DNAnexus project (usually in the qc folder). This contains metrics produced
    by Picard Tools's CollectAlignmentSummaryMetrics program.
    """
    filename = barcode + ".alignment_summary_metrics"
    # more_ok is deliberately left unset here: passing more_ok=False blows up
    # in Python 3.7 with a RuntimeError (dxpy is still mainly a Python 2.7
    # library and can break in later versions of Python 3).
    try:
        dx_result = dxpy.find_one_data_object(zero_ok=False,
                                              project=self.dx_project_id,
                                              name=filename)
    except dxpy.exceptions.DXSearchError as err:
        msg = "Picard alignment summary metrics for barcode {} in DX project {} not found.".format(
            barcode, self.dx_project_id)
        debug_logger.error(msg)
        raise DxMissingAlignmentSummaryMetrics(msg)
    contents = StringIO(dxpy.open_dxfile(dx_result["id"]).read())
    return picard.CollectAlignmentSummaryMetrics(contents).metrics
def main(**job_inputs): if 'prefix' not in job_inputs: bam_name = dxpy.describe(job_inputs['illumina_bam'])['name'] if bam_name.endswith("cram"): prefix = bam_name[:-5] else: prefix = bam_name[:-4] else: prefix = job_inputs['prefix'] # Running Docker image subprocess.check_call( ['mkdir', '-p', '/home/dnanexus/in', '/home/dnanexus/out']) print "Starting Docker" input_bam = dxpy.open_dxfile(job_inputs['illumina_bam']) bam_name = "/home/dnanexus/in/{0}".format(input_bam.name) dxpy.download_dxfile(input_bam, bam_name) ref_genome = dxpy.open_dxfile(job_inputs['ref_fasta']) ref_name = "/home/dnanexus/in/{0}".format(ref_genome.name) dxpy.download_dxfile(ref_genome, ref_name) docker_call = [ 'dx-docker', 'run', '-v', '/home/dnanexus/in/:/home/dnanexus/in/', '-v', '/home/dnanexus/out/:/home/dnanexus/out/', 'parliament2', '--bam', bam_name, '-r', ref_name, '--prefix', str(prefix) ] if 'illumina_bai' in job_inputs: input_bai = dxpy.open_dxfile(job_inputs['illumina_bai']) bai_name = "/home/dnanexus/in/{0}".format(input_bai.name) dxpy.download_dxfile(input_bai, bai_name) docker_call.extend(['--bai', bai_name]) if job_inputs['filter_short_contigs']: docker_call.append('--filter_short_contigs') if job_inputs['run_breakdancer']: docker_call.append('--breakdancer') if job_inputs['run_breakseq']: docker_call.append('--breakseq') if job_inputs['run_manta']: docker_call.append('--manta') if job_inputs['run_cnvnator']: docker_call.append('--cnvnator') if job_inputs['run_lumpy']: docker_call.append('--lumpy') if job_inputs['run_delly_inversion']: docker_call.append('--delly_inversion') if job_inputs['run_delly_insertion']: docker_call.append('--delly_insertion') if job_inputs['run_delly_deletion']: docker_call.append('--delly_deletion') if job_inputs['run_delly_duplication']: docker_call.append('--delly_duplication') if job_inputs['run_genotype_candidates']: docker_call.append('--genotype') if job_inputs['run_svviz']: docker_call.append('--svviz') if 
job_inputs['svviz_only_validated_candidates']: docker_call.append('--svviz_only_validated_candidates') subprocess.check_call(docker_call) print "Docker image finished" sv_caller_results_names = glob.glob( '/home/dnanexus/out/sv_caller_results/*') sv_caller_results_upload = [] for name in sv_caller_results_names: sv_caller_results_upload.append( dxpy.dxlink(dxpy.upload_local_file(name))) output = {'sv_caller_results': sv_caller_results_upload} subprocess.check_call(['ls', '-sh', '/home/dnanexus/out/svtyped_vcfs/']) if job_inputs['output_log_files'] and os.listdir( '/home/dnanexus/out/log_files/'): log_file_names = glob.glob('/home/dnanexus/out/log_files/*') log_file_upload = [] for name in log_file_names: log_file_upload.append(dxpy.dxlink(dxpy.upload_local_file(name))) output['log_files'] = log_file_upload if job_inputs['run_genotype_candidates']: svtyped_vcf_names = glob.glob('/home/dnanexus/out/svtyped_vcfs/*') svtyped_vcfs_upload = [] for name in svtyped_vcf_names: svtyped_vcfs_upload.append( dxpy.dxlink(dxpy.upload_local_file(name))) output['svtyped_vcfs'] = svtyped_vcfs_upload output['combined_genotypes'] = dxpy.dxlink( dxpy.upload_local_file( '/home/dnanexus/out/{0}.combined.genotyped.vcf'.format( prefix))) if job_inputs['run_svviz'] and os.path.isfile( '/home/dnanexus/out/{0}.svviz_outputs.tar.gz'.format(prefix)): output['svviz_outputs'] = dxpy.dxlink( dxpy.upload_local_file( '/home/dnanexus/out/{0}.svviz_outputs.tar.gz'.format(prefix))) return output
report_link = dxencode.find_file(report_fn, project.get_id()) metrics = {} try: with dxpy.open_dxfile(report_link) as rfd: total = re.compile('Total Sequences\s+(\d+)') for line in rfd: m = total.match(line) if m: metrics.update({ 'Total Sequences': m.group(1) }) except Exception, e: print "ERROR: Could not read FastQC summary: %s (%s) \n%s" % (summary_fn, summary_link, e) metrics.update({'Total Sequences': -999.999 }) try: with dxpy.open_dxfile(summary_link) as sfd: fastqc = re.compile('(PASS|FAIL|WARN)\s+(.+)\s+ENCFF') for line in sfd: m = fastqc.match(line) if m: metrics.update({ m.group(2): m.group(1) }) except Exception, e: print "ERROR: Could not read FastQC report: %s (%s) \n%s" % (report_fn, report_link, e) #print json.dumps(metrics) return metrics def get_analysis_time(accession, repstr, project): result = list(dxpy.find_analyses(project=project, name='*'+accession+repstr+'*', name_mode='glob', describe=True, state='done'))