def setUpClass(cls):
    """Provision the job the test suite asserts against.

    When RUN_JOB_ON_DX is set, launch a fresh vcfscope-measure job in the
    project named by PROJ_NAME; otherwise attach to a known previous job.
    Either way, block until the job has finished.
    """
    if RUN_JOB_ON_DX:
        # A target project is mandatory when launching a new job.
        if not project_name:
            print("'PROJ_NAME' environment variable must be defined!")
            sys.exit(1)
        target_project = dxpy.find_one_project(more_ok=False,
                                               name=project_name)["id"]
        run_kwargs = {
            "project": target_project,
            "name": "vcfscope-measure on chr21",
            "folder": "/purge/" + app_name,
        }
        # Canned chr21 inputs for the app.
        job_inputs = {
            "vcfgz": dxpy.dxlink("file-BkkjFkj098Gb2jZ1Yx533JFv", project_id),
            "bam": dxpy.dxlink("file-Bkkjj5Q098Gkvkb3Xx5Pxj1J", project_id),
            "bai": dxpy.dxlink("file-Bkkjj5Q098GzYx2bG5YJ3z34", project_id),
            "region": dxpy.dxlink("file-Bkkj22Q098Gz5yK1Q955G5gX", project_id),
        }
        app = dxpy.DXApp(name=app_name, alias="9.9.7")
        cls.job = app.run(job_inputs, **run_kwargs)
    else:
        # Reuse a completed job instead of spending compute.
        cls.job = dxpy.DXJob("job-F1JpY9Q0pVj0BgpYBp14f31Q")
    cls.job.wait_on_done()
def main():
    """Write to inputs_stats.txt the file-class inputs of an executable that
    are not already wired into stage 0 of a workflow.

    Command-line arguments (colon-suffixed forms are split on ':'):
        sys.argv[1]: DNAnexus file ID — skipped entirely if it lives under
                     a "/Results" folder
        sys.argv[2]: workflow ID
        sys.argv[3]: app or applet ID

    Fix: the output file handle was opened bare and leaked on the early
    ``return`` path; it is now managed by a ``with`` block, which preserves
    the original behavior of creating/truncating the file before the folder
    check while guaranteeing closure on every exit path.
    """
    with open("inputs_stats.txt", "w") as inputs_file:
        print(sys.argv[2])
        workflow = dxpy.DXWorkflow(sys.argv[2].split(":")[-1])
        fh = dxpy.DXFile(sys.argv[1].split(":")[-1])
        # Files already under a Results folder are not candidate inputs.
        if "/Results" in fh.describe()['folder']:
            return
        app_id = sys.argv[3]
        # Applet IDs contain the literal substring "applet"; anything else
        # is treated as an app.
        if "applet" in app_id:
            app = dxpy.DXApplet(app_id)
        else:
            app = dxpy.DXApp(app_id)
        # Names already bound as inputs on the workflow's first stage.
        existing_inputs = list(workflow.describe()['stages'][0]['input'])
        print(existing_inputs)
        for spec in app.describe()['inputSpec']:
            print(spec)
            if spec['class'] == 'file' and spec['name'] not in existing_inputs:
                inputs_file.write(spec['name'] + "\n")
def __init__(self, name='bwa_mem_fastq_read_mapper', version='1.5.0'):
    """Resolve a DNAnexus app by name and exact version string.

    The find_apps search has no per-version filter — its only version
    option is the 'all_versions' boolean (default vs. every version) — so
    each candidate is described and its 'version' field compared manually.

    Args:
        name (str): app name to search for.
        version (str): exact version string required.

    Side effects: prints and exits the process if no app with the given
    name, or no matching version, is found.

    Fix: the original consumed find_apps() with list() just to test for
    emptiness and then issued a second, identical search request; the
    results are now fetched once and the cached list is reused.
    """
    self.name = name
    self.version = version
    self.dxid = None
    self.object = None
    # all_versions=False returns only each app's default version.
    apps = list(dxpy.find_apps(name=name, all_versions=False))
    if not apps:
        # raise dxpy.AppError('Unable to find app called %s' % name)
        print('Error: Could not find any app with name: %s' % name)
        sys.exit()
    for app in apps:
        app_description = dxpy.api.app_describe(app['id'])
        app_version = app_description['version']
        if app_version == self.version:
            self.dxid = app['id']
            break
        else:
            # Show non-matching versions to aid debugging.
            print(app_version)
    if not self.dxid:
        print('Could not find app: %s, version: %s' % (self.name, self.version))
        sys.exit()
    self.object = dxpy.DXApp(dxid=self.dxid)  # bwa_mem : app-BXQy79Q0y7yQJVff3j9Y2B83
def get_handler_from_desc(desc):
    """Return the dxpy handler matching a describe() dict's 'class' field.

    'applet' and 'workflow' handlers are bound to the describing project;
    apps are global objects and take only their ID.
    """
    kind = desc['class']
    if kind == 'app':
        return dxpy.DXApp(dxid=desc['id'])
    if kind == 'applet':
        return dxpy.DXApplet(desc['id'], project=desc['project'])
    # Anything else is treated as a workflow.
    return dxpy.DXWorkflow(desc['id'], project=desc['project'])
def get_exec_handler(path, alias=None):
    """Resolve `path` to an executable handler: DXApplet, DXApp, or DXWorkflow.

    When `alias` is given, `path` is taken to be an app name (any leading
    'app-' prefix stripped) and resolved directly by name+alias. Otherwise
    both the global app namespace and the project data namespace are
    searched; on ambiguity the user is asked to pick interactively, or a
    ResolutionError is raised in non-interactive sessions or when nothing
    matches at all.
    """
    handler = None

    def get_handler_from_desc(desc):
        # Map a describe() dict onto the appropriate dxpy handler class.
        if desc['class'] == 'applet':
            return dxpy.DXApplet(desc['id'], project=desc['project'])
        elif desc['class'] == 'app':
            return dxpy.DXApp(dxid=desc['id'])
        else:
            return dxpy.DXWorkflow(desc['id'], project=desc['project'])

    if alias is None:
        app_desc = get_app_from_path(path)
        try:
            # Look for applets and workflows
            _project, _folderpath, entity_results = resolve_existing_path(
                path,
                expected='entity',
                ask_to_resolve=False,
                expected_classes=['applet', 'record', 'workflow'],
                visibility="visible")

            def is_applet_or_workflow(i):
                return (i['describe']['class'] in ['applet', 'workflow'])

            # Records can match the expected_classes filter above; keep only
            # the genuinely runnable results, and normalize empty to None.
            if entity_results is not None:
                entity_results = [i for i in entity_results if is_applet_or_workflow(i)]
                if len(entity_results) == 0:
                    entity_results = None
        except ResolutionError:
            # Path didn't resolve as a data object; only fatal if there is
            # no app by this name either.
            if app_desc is None:
                raise
            else:
                entity_results = None

        if entity_results is not None and len(entity_results) == 1 and app_desc is None:
            # Exactly one data-object match and no same-named app.
            handler = get_handler_from_desc(entity_results[0]['describe'])
        elif entity_results is None and app_desc is not None:
            # Only an app matched.
            handler = get_handler_from_desc(app_desc)
        elif entity_results is not None:
            # Multiple candidates: interactive disambiguation required.
            if not INTERACTIVE_CLI:
                raise ResolutionError('Found multiple executables with the path ' + path)
            print('Found multiple executables with the path ' + path)
            choice_descriptions = [get_ls_l_desc(r['describe']) for r in entity_results]
            if app_desc is not None:
                choice_descriptions.append('app-' + app_desc['name'] + ', version ' + app_desc['version'])
            choice = pick(choice_descriptions)
            if choice < len(entity_results):
                # all applet/workflow choices show up before the app,
                # of which there is always at most one possible choice
                handler = get_handler_from_desc(entity_results[choice]['describe'])
            else:
                handler = get_handler_from_desc(app_desc)
        else:
            raise ResolutionError("No matches found for " + path)
    else:
        # Explicit alias: resolve as an app by name.
        if path.startswith('app-'):
            path = path[4:]
        handler = dxpy.DXApp(name=path, alias=alias)
    return handler
def run_bwa_mem(sample, fastq_dict, mapper_app_dxid, ref_genome_index, project_id):
    '''
    Description: Maps sample fastq files to a reference genome

    Args:
        sample (dict) - sample[<barcode>] = [<fastq files>]
        mapper (dxid)
        ref_genome (dxid)
    '''
    # (The stock DNAnexus app 'bwa_mem_fastq_read_mapper' v1.5.0 could also
    # be resolved via MapperApp; here the app dxid is passed in directly.)
    dxpy.set_workspace_id(project_id)
    mapper_app = dxpy.DXApp(mapper_app_dxid)

    # hg19 : file-B6qq53v2J35Qyg04XxG0000V
    mapper_input = {'genomeindex_targz': dxpy.dxlink(ref_genome_index)}

    # Wire up the fastq inputs; only single- or paired-end is supported,
    # keyed by '1' and '2' in fastq_dict.
    n_fastqs = len(fastq_dict)
    if n_fastqs == 0:
        print('Error: No fastq files listed for sample %s' % sample)
        sys.exit()
    if n_fastqs > 2:
        print('Error: More than 2 fastq files passed for mapping sample %s' % sample)
        sys.exit()
    mapper_input['reads_fastqgz'] = dxpy.dxlink(fastq_dict['1'])
    if n_fastqs == 2:
        mapper_input['reads2_fastqgz'] = dxpy.dxlink(fastq_dict['2'])
    print(mapper_input)

    mapper_job = mapper_app.run(mapper_input)
    job_id = mapper_job.get_id()
    # Job-based object references the downstream stages consume.
    return {
        "BAM": {"job": job_id, "field": "sorted_bam"},
        "BAI": {"job": job_id, "field": "sorted_bai"},
    }
def map_contaminant(Contig, Reads):
    """Map `Reads` against a contaminant `Contig` and kick off a job that
    computes the fraction of reads that mapped.

    Args:
        Contig: reference object for the contaminant.
        Reads: list of reads-table IDs.

    Returns:
        The job ID of the 'calc_contam' follow-up job.

    Raises:
        dxpy.AppError: if the bwa_mem_fastq_read_mapper app is not installed.

    Fix: the Python-2-only generator method ``.next()`` is replaced with the
    builtin ``next()``, which behaves identically on Python 2.6+ and 3.
    """
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(
            next(dxpy.find_apps(name="bwa_mem_fastq_read_mapper"))['id'])
    except StopIteration:
        raise dxpy.AppError(
            "Unable to find app 'bwa_mem_fastq_read_mapper'. Please install it to enable contaminant mapping"
        )

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({
        "reads": Reads,
        "reference": Contig,
        "discard_unmapped_rows": True,
        "chunk_size": 10000000
    })

    # Total read count across all tables; a 'sequence2' column marks a
    # paired-end table, i.e. two reads per row.
    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job to wait for the mapping and will calculate what % has mapped
    calc_job = dxpy.new_dxjob(
        {
            "num_reads": total_reads,
            "mappings": {"job": map_job.get_id(), "field": "mappings"}
        },
        "calc_contam")
    return calc_job.get_id()
This script calls the DNAnexus app I built called SCGPM Clean Raw Data at
https://platform.dnanexus.com/app/scgpm_clean_raw_data. Removes unwanted files
(that drive up the storage costs) from the raw_data folder of a DNAnexus
project containing sequencing results from the SCGPM sequencing workflow.
Most of the files in the raw_data folder are removed. Moreover, the lane
tarball is removed; the XML files RunInfo.xml and runParameters.xml are
extracted from Interop.tar and then the tarball is removed; finally,
metadata.tar is removed. The extracted XML files are uploaded back to the
raw_data folder.

Queries DNAnexus for all projects billed to the specified org and that were
created within the last -d days. You must have the environment variable
DX_SECURITY_CONTEXT set (described at
http://autodoc.dnanexus.com/bindings/python/current/dxpy.html?highlight=token)
in order to authenticate with DNAnexus.
"""

import subprocess
import argparse

import dxpy

RAW_DATA_FOLDER = "/raw_data"  # The raw_data folder location in a SCGPM DNAnexus project.
APP_NAME = "scgpm_clean_raw_data"  # App's name on DNAnexus
# NOTE(review): resolved at import time — loading this module requires DNAnexus
# credentials and network access; verify this is intended.
APP = dxpy.DXApp(name="scgpm_clean_raw_data")


def get_parser():
    """Build the argument parser for this script (uses the module docstring
    as the program description)."""
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-d', "--days-ago", type=int, default=30, help="""
    The number of days ago to query for new projects that are billed to the org specified by --org.""")
    parser.add_argument('-o', "--org", required=True, help="""
    Limits the project search to only those that belong to the specified DNAnexus org. Should begin with 'org-'.""")
    return parser


def main():
    """Script entry point: parse command-line arguments.

    NOTE(review): main() appears truncated in this excerpt — the project
    query and cleanup presumably follow; confirm against the full file.
    """
    parser = get_parser()
    args = parser.parse_args()
    days_ago = args.days_ago
def test_base_input(self):
    """Run the app under test on the canned base input, wait for it, and
    dump the resulting job output as JSON."""
    running_job = dxpy.DXApp(self.app_id).run(self.base_input)
    print("Waiting for job to complete")
    running_job.wait_on_done()
    job_output = running_job.describe()["output"]
    print(json.dumps(job_output))
def main(**job_inputs):
    """RNA-seq pipeline entry point: convert reads tables to FASTQ/A, map
    them with Tophat, import the BAM via the sam_importer app, and quantify
    transcripts with Cufflinks.

    Expected job_inputs keys (from the visible reads below): 'reads',
    'gene_model', 'reference', 'tophat_options', 'output_name', and
    optionally 'indexed_reference' and 'cufflinks_options'.

    Returns a dict with 'indexed_reference', 'mappings' (job-based object
    reference), 'transcripts', and 'cufflinks_output'.
    """
    print "Beginning processing of RNA data"
    output = {}
    check_reads(job_inputs['reads'])

    # Convert reads tables to FASTQ/FASTA files
    left_reads = []
    right_reads = []
    current_reads = 0
    for reads in job_inputs['reads']:
        print "Converting reads table " + str(reads['$dnanexus_link'])
        left, right = dump_fastqa(reads['$dnanexus_link'],
                                  "reads_" + str(current_reads))
        left_reads.append(left)
        # right is None for single-end tables.
        # NOTE(review): `is not None` would be the idiomatic comparison.
        if right != None:
            right_reads.append(right)
        current_reads += 1

    # Convert Genes Object to GFF file
    run_shell("dx-genes-to-gtf --output genes.gtf " + job_inputs['gene_model']['$dnanexus_link'])

    # Create or download indexed genome
    genome = dxpy.DXRecord(job_inputs['reference'])
    if not 'indexed_reference' in job_inputs:
        output['indexed_reference'] = dxpy.dxlink(
            make_indexed_reference(genome.get_id()))
    else:
        output['indexed_reference'] = job_inputs['indexed_reference']
    # NOTE(review): this reads job_inputs['indexed_reference'] directly, so
    # the branch above that builds a new index appears not to feed this
    # lookup — confirm whether the freshly built index should be used here.
    indexed_genome = dxpy.DXRecord(job_inputs['indexed_reference'])
    dxpy.download_dxfile(indexed_genome.get_details()['index_archive'],
                         "reference.tar.xz")
    run_shell("tar -xJf reference.tar.xz")

    # call tophat
    num_cpus = multiprocessing.cpu_count()
    cmd = " ".join([
        'tophat', "-p", str(num_cpus), job_inputs['tophat_options'],
        "-G genes.gtf", "--transcriptome-index=./genes", "-T", "indexed_ref",
        " ", ",".join(left_reads)
    ])
    if len(right_reads) != 0:
        cmd += " " + ",".join(right_reads)

    # Invoke tophat2 with FASTQ/A file(s) and indexed reference
    try:
        run_shell(cmd)
    except:
        raise dxpy.AppError(
            "Error while running Tophat. This could be caused by an incompatible gene model and reference or incorrect optional parameters. Please check that these are all correct"
        )

    # upload and import the BAM as a Mappings table
    accepted_hits_file = dxpy.upload_local_file('tophat_out/accepted_hits.bam',
                                                wait_on_close=True)
    name = job_inputs['output_name']
    name += "_mappings"
    sam_importer = dxpy.DXApp(name="sam_importer")
    print "Importing BAM output of Tophat"
    import_job = sam_importer.run({
        "file": dxpy.dxlink(accepted_hits_file.get_id()),
        "reference_genome": dxpy.dxlink(genome.get_id()),
        "name": name
    })

    cuff_cmd = " ".join(
        ['cufflinks', '-p', str(num_cpus), '-G genes.gtf', '-o cuff'])
    if 'cufflinks_options' in job_inputs:
        cuff_cmd += " " + job_inputs['cufflinks_options']
    cuff_cmd += " tophat_out/accepted_hits.bam"

    # now with mapped reads in hand we can run cufflinks
    try:
        run_shell(cuff_cmd)
    except:
        raise dxpy.AppError(
            "Error while running Cufflinks. Please check that your parameters are valid"
        )

    print "Packing, uploading, and parsing cufflinks output"

    # package cufflinks output
    run_shell("tar -czf cufflinks_output.tar.gz cuff/")
    cuff_name = job_inputs['output_name'] + "_cufflinks_output.tar.gz"
    orig_trans_file = dxpy.upload_local_file("cufflinks_output.tar.gz")
    orig_trans_file.rename(cuff_name)
    transcripts_table = upload_transcripts_file('cuff/genes.fpkm_tracking',
                                                job_inputs['output_name'])

    # Mappings come from the still-running import job as a job-based object
    # reference; the other outputs are direct links.
    output['mappings'] = {"job": import_job.get_id(), "field": "mappings"}
    output['transcripts'] = dxpy.dxlink(transcripts_table.get_id())
    output['cufflinks_output'] = dxpy.dxlink(orig_trans_file.get_id())

    print "DONE!"
    return output