def get_lane_input_files(self):
    metadata_tar = '%s.metadata.tar*' % self.run_name
    self.metadata_tar_id = dxpy.find_one_data_object(classname='file',
                                                     name=metadata_tar,
                                                     name_mode='glob',
                                                     project=self.project_id,
                                                     folder='/raw_data',
                                                     zero_ok=False,
                                                     more_ok=True)['id']
    lane_tar = '%s_L%d.tar*' % (self.run_name, self.lane_index)
    self.lane_tar_id = dxpy.find_one_data_object(classname='file',
                                                 name=lane_tar,
                                                 name_mode='glob',
                                                 project=self.project_id,
                                                 folder='/raw_data',
                                                 zero_ok=False,
                                                 more_ok=True)['id']
    interop_tar = '%s.InterOp.tar*' % self.run_name
    self.interop_tar_id = dxpy.find_one_data_object(classname='file',
                                                    name=interop_tar,
                                                    name_mode='glob',
                                                    project=self.project_id,
                                                    folder='/raw_data',
                                                    zero_ok=False,
                                                    more_ok=True)['id']
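# A minimal sketch of the dxpy.find_one_data_object() match semantics relied
# on throughout these snippets (the helper name and arguments below are
# illustrative, not from the original code): zero_ok=False raises
# dxpy.exceptions.DXSearchError when nothing matches, while more_ok=True
# returns one of several matches instead of raising.
import dxpy

def first_match_or_none(name_glob, project_id, folder='/'):
    # With zero_ok=True, None is returned instead of raising on zero matches.
    return dxpy.find_one_data_object(classname='file', name=name_glob,
                                     name_mode='glob', project=project_id,
                                     folder=folder, zero_ok=True, more_ok=True)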
def test_paired_with_contam(self):
    bed_file = dxpy.find_one_data_object(
        name="hg19_GRCh37_Feb2009_RefSeq.bed")['id']
    mappings = dxpy.find_one_data_object(
        name="SRR018256_paired_RNA_Mappings",
        typename="LetterMappings")['id']
    contam_contig = dxpy.find_one_data_object(name="human rRNA",
                                              typename="ContigSet")['id']
    reads = dxpy.find_one_data_object(name="SRR018256_reads",
                                      typename="LetterReads")['id']
    if bed_file is None:
        print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed. Please upload it"
        return False
    if mappings is None:
        print "Cannot find Mappings. Please upload them"
        return False
    if contam_contig is None:
        print "Cannot find human rRNA. Please upload it"
        return False
    if reads is None:
        print "Cannot find SRR018256_reads. Please upload it"
        return False
    input = {'rna_seq_mappings': dxpy.dxlink(mappings),
             'bed_file': dxpy.dxlink(bed_file),
             'contaminants': [dxpy.dxlink(contam_contig)],
             'original_reads': [dxpy.dxlink(reads)]}
    print "Running program with", input
    job = self.program.run(input)
    print "launched test_paired_with_contam ", job.get_id()
def resolve_file(identifier):
    logging.debug("resolve_file: %s" % (identifier))
    if not identifier:
        return None
    m = re.match(r'''^([\w\-\ \.]+):([\w\-\ /\.]+)''', identifier)
    if m:
        project_identifier = m.group(1)
        file_identifier = m.group(2)
    else:
        logging.debug("Defaulting to the current project")
        project_identifier = dxpy.WORKSPACE_ID
        file_identifier = identifier
    project = resolve_project(project_identifier)
    logging.debug("Got project %s" % (project.name))
    logging.debug("Now looking for file %s" % (file_identifier))
    m = re.match(r'''(^[\w\-\ /\.]+)/([\w\-\ \.]+)''', file_identifier)
    if m:
        folder_name = m.group(1)
        if not folder_name.startswith('/'):
            folder_name = '/' + folder_name
        file_name = m.group(2)
    else:
        # folder_name = '/'
        folder_name = None
        file_name = file_identifier
    logging.debug("Looking for file %s in folder %s" % (file_name, folder_name))
    try:
        if folder_name:
            file_handler = dxpy.find_one_data_object(name=file_name,
                                                     folder=folder_name,
                                                     project=project.get_id(),
                                                     recurse=False,
                                                     more_ok=False,
                                                     zero_ok=False,
                                                     return_handler=True)
        else:
            file_handler = dxpy.find_one_data_object(name=file_name,
                                                     project=project.get_id(),
                                                     folder='/',
                                                     recurse=True,
                                                     more_ok=False,
                                                     zero_ok=False,
                                                     return_handler=True)
    except dxpy.DXSearchError:
        logging.debug('%s not found in project %s folder %s. Trying as file ID'
                      % (file_name, project.get_id(), folder_name))
        try:
            file_handler = dxpy.DXFile(dxid=identifier, mode='r')
        except:
            logging.debug('%s not found as a dxid' % (identifier))
            try:
                file_handler = resolve_accession(identifier)
            except:
                logging.debug('%s not found as an accession' % (identifier))
                logging.warning('Could not find file %s.' % (identifier))
                return None
    except:
        raise
    logging.info("Resolved file identifier %s to %s"
                 % (identifier, file_handler.get_id()))
    return file_handler
def spp(experiment, control, xcor_scores, chrom_sizes, spp_version,
        bigbed=False, as_file=None, name="spp", prefix=None,
        fragment_length=None, spp_instance=None):
    spp_applet = dxpy.find_one_data_object(
        classname='applet',
        name='spp',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    spp_input = {"experiment": experiment,
                 "control": control,
                 "xcor_scores_input": xcor_scores,
                 "bigbed": bigbed,
                 "chrom_sizes": chrom_sizes,
                 "spp_version": spp_version}
    if fragment_length is not None:
        spp_input.update({"fragment_length": fragment_length})
    if bigbed and as_file:
        spp_input.update({"as_file": as_file})
    if prefix:
        spp_input.update({"prefix": prefix})
    spp_args = {'name': name}
    if spp_instance:
        spp_args.update({'instance_type': spp_instance})
    return spp_applet.run(spp_input, **spp_args)
def resolve_accession(accession):
    logging.debug("Looking for accession %s" % (accession))
    if not re.match(r'''^ENCFF\d{3}[A-Z]{3}''', accession):
        logging.debug("%s is not a valid accession format" % (accession))
        raise ValueError(accession)
    DNANEXUS_ENCODE_SNAPSHOT = 'ENCODE-SDSC-snapshot-20140505'
    logging.debug('Testing')
    try:
        # Reuse the module-level snapshot project handler if it already exists.
        snapshot_project
    except:
        logging.debug('Looking for snapshot project %s'
                      % (DNANEXUS_ENCODE_SNAPSHOT))
        try:
            project_handler = resolve_project(DNANEXUS_ENCODE_SNAPSHOT)
            global snapshot_project
            snapshot_project = project_handler
        except:
            logging.error("Cannot find snapshot project %s"
                          % (DNANEXUS_ENCODE_SNAPSHOT))
            raise ValueError(DNANEXUS_ENCODE_SNAPSHOT)
    logging.debug('Found snapshot project %s' % (snapshot_project.name))
    try:
        accession_search = accession + '*'
        logging.debug('Looking recursively for %s in %s'
                      % (accession_search, snapshot_project.name))
        file_handler = dxpy.find_one_data_object(
            name=accession_search,
            name_mode='glob',
            more_ok=False,
            classname='file',
            recurse=True,
            return_handler=True,
            folder='/',
            project=snapshot_project.get_id())
        logging.debug('Got file handler for %s' % (file_handler.name))
        return file_handler
    except:
        logging.error("Cannot find accession %s in project %s"
                      % (accession, snapshot_project.name))
        raise ValueError(accession)
def test_unpaired(self):
    bed_file = dxpy.find_one_data_object(
        name="hg19_GRCh37_Feb2009_RefSeq.bed")['id']
    mappings = dxpy.find_one_data_object(name="unpaired_RNA-Seq_mappings",
                                         typename="LetterMappings")['id']
    if bed_file is None:
        print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed. Please upload it"
        return False
    if mappings is None:
        print "Cannot find unpaired_RNA-Seq_mappings. Please upload it"
        return False
    input = {'rna_seq_mappings': dxpy.dxlink(mappings),
             'bed_file': dxpy.dxlink(bed_file)}
    print "Running program with", input
    job = self.program.run(input)
    print "launched test_unpaired ", job.get_id()
def macs2(experiment, control, xcor_scores, chrom_sizes, narrowpeak_as,
          gappedpeak_as, broadpeak_as, genomesize, prefix=None):
    macs2_applet = dxpy.find_one_data_object(classname='applet',
                                             name='macs2',
                                             project=dxpy.PROJECT_CONTEXT_ID,
                                             zero_ok=False,
                                             more_ok=False,
                                             return_handler=True)
    macs2_input = {
        "experiment": experiment,
        "control": control,
        "xcor_scores_input": xcor_scores,
        "chrom_sizes": chrom_sizes,
        "narrowpeak_as": narrowpeak_as,
        "gappedpeak_as": gappedpeak_as,
        "broadpeak_as": broadpeak_as,
        "genomesize": genomesize
    }
    if prefix:
        macs2_input.update({'prefix': prefix})
    return macs2_applet.run(macs2_input)
def download_qc_report(self, download_dir):
    """
    Downloads the QC report from the DNAnexus sequencing results project.

    Args:
        download_dir: `str` - The local directory path to download the QC
            report to.

    Returns:
        `str`. The filepath to the downloaded QC report.
    """
    if not os.path.isdir(download_dir):
        os.makedirs(download_dir)
    res = dxpy.find_one_data_object(project=self.dx_project_id,
                                    folder=self.DX_QC_REPORT_FOLDER,
                                    name="*_QC_Report.pdf",
                                    name_mode="glob")
    # res will be something like
    # {u'project': u'project-BzqVkxj08kVZbPXk54X0P2JY',
    #  u'id': u'file-BzqVkg800Fb0z4437GXJfGY6'}
    # dxpy.find_one_data_object() raises a dxpy.exceptions.DXSearchError() if
    # nothing is found.
    dx_file = dxpy.DXFile(dxid=res["id"], project=res["project"])
    download_file_name = os.path.join(download_dir, dx_file.name)
    msg = "{filename} to {download_dir}.".format(filename=dx_file.name,
                                                 download_dir=download_dir)
    debug_logger.debug("Downloading " + msg)
    dxpy.bindings.dxfile_functions.download_dxfile(
        dxid=dx_file, filename=download_file_name)
    success_logger.info("Downloaded " + msg)
    return download_file_name
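# A standalone sketch of the same lookup-then-download pattern, assuming dxpy
# is authenticated; the function name and its arguments are illustrative, not
# part of the original class.
import os
import dxpy

def fetch_one_file(project_id, folder, pattern, download_dir):
    # Resolve exactly one file matching the glob; DXSearchError is raised on
    # zero matches because zero_ok defaults to False.
    res = dxpy.find_one_data_object(project=project_id, folder=folder,
                                    name=pattern, name_mode="glob")
    dx_file = dxpy.DXFile(dxid=res["id"], project=res["project"])
    if not os.path.isdir(download_dir):
        os.makedirs(download_dir)
    local_path = os.path.join(download_dir, dx_file.name)
    dxpy.download_dxfile(dxid=dx_file.get_id(), filename=local_path)
    return local_path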
def spp(experiment, control, xcor_scores, chrom_sizes, spp_version,
        bigbed=False, as_file=None, name="spp", prefix=None,
        fragment_length=None):
    spp_applet = dxpy.find_one_data_object(
        classname='applet',
        name='spp',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    spp_input = {
        "experiment": experiment,
        "control": control,
        "xcor_scores_input": xcor_scores,
        "bigbed": bigbed,
        "chrom_sizes": chrom_sizes,
        "spp_version": spp_version
    }
    if fragment_length is not None:
        spp_input.update({"fragment_length": fragment_length})
    if bigbed and as_file:
        spp_input.update({"as_file": as_file})
    if prefix:
        spp_input.update({"prefix": prefix})
    return spp_applet.run(spp_input, name=name)
def macs2(experiment, control, xcor_scores, chrom_sizes, narrowpeak_as,
          gappedpeak_as, broadpeak_as, genomesize, name="MACS2", prefix=None,
          fragment_length=None):
    macs2_applet = dxpy.find_one_data_object(
        classname='applet',
        name='macs2',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    macs2_input = {
        "experiment": experiment,
        "control": control,
        "xcor_scores_input": xcor_scores,
        "chrom_sizes": chrom_sizes,
        "narrowpeak_as": narrowpeak_as,
        "gappedpeak_as": gappedpeak_as,
        "broadpeak_as": broadpeak_as,
        "genomesize": genomesize
    }
    if prefix:
        macs2_input.update({'prefix': prefix})
    if fragment_length is not None:
        macs2_input.update({'fragment_length': fragment_length})
    return macs2_applet.run(macs2_input, name=name)
def find_applet_by_name(applet_name):
    '''Looks up an applet by name in the project that holds tools.
    From Joe Dale's code.'''
    found = dxpy.find_one_data_object(classname="applet",
                                      name=applet_name,
                                      project=ENCODE_CHIP_SEQ_PROJECT,
                                      zero_ok=False,
                                      more_ok=False,
                                      return_handler=True)
    print "Resolved %s to %s" % (applet_name, found.get_id())
    return found
def spp(experiment, control, xcor_scores):
    spp_applet = dxpy.find_one_data_object(
        classname='applet',
        name='spp',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    return spp_applet.run(
        {"experiment": experiment,
         "control": control,
         "xcor_scores_input": xcor_scores},
        instance_type="mem2_ssd1_x8")
def find_applet(applet_name):
    return dxpy.find_one_data_object(classname='applet',
                                     name=applet_name,
                                     project=project.get_id(),
                                     folder=applets_folder,
                                     zero_ok=False,
                                     more_ok=False,
                                     return_handler=True)
def run_gene_ebseq(treatment_folder, control_folder, patient_id):
    """
    Runs the 'rsem-ebseq gene expression' workflow I wrote.
    """
    gene_glob = "*.genes.results"
    # The quant files are the gene expression files created by
    # rsem-calculate-expression.
    quant_file = dxpy.find_one_data_object(more_ok=False,
                                           project=jo_wu_project,
                                           folder=treatment_folder,
                                           name=gene_glob,
                                           name_mode="glob")
    quant_file = dxpy.dxlink(project_id=quant_file['project'],
                             object_id=quant_file['id'])
    control_quant_file = dxpy.find_one_data_object(more_ok=False,
                                                   project=jo_wu_project,
                                                   folder=control_folder,
                                                   name=gene_glob,
                                                   name_mode="glob")
    control_quant_file = dxpy.dxlink(project_id=control_quant_file['project'],
                                     object_id=control_quant_file['id'])
    treatment_barcode = os.path.split(treatment_folder)[-1]
    control_barcode = os.path.split(control_folder)[-1]
    wf = dxpy.DXWorkflow(project="project-BxxYqbQ0v3VQz5z2bvFyF6YV",
                         dxid="workflow-ByFG8g00v3VpjkVxP0gkpX7X")
    destination_folder = os.path.join(treatment_folder, "ebseq")
    workflow_input = {"0.results": [quant_file, control_quant_file],
                      "1.conditions": "1,1"}
    job_properties = {"patient_id": patient_id,
                      "control_barcode": control_barcode,
                      "treatment_barcode": treatment_barcode}
    job_name = "_".join([patient_id, treatment_barcode, control_barcode,
                         wf.name])
    wf.run(debug={"debugOn": ["AppError", "AppInternalError"]},
           workflow_input=workflow_input,
           project=jo_wu_project,
           folder=destination_folder,
           name=job_name,
           properties=job_properties)
def get_reference_ids(self):
    reference_genome_project = 'project-F3x6Zf89QqxF6vjK0qfkJG1y'
    self.reference_genome_dxid = dxpy.find_one_data_object(
        classname='file',
        name='genome.fa.gz',
        name_mode='exact',
        project=reference_genome_project,
        folder='/%s' % self.reference_genome,
        zero_ok=False,
        more_ok=False)['id']
    self.reference_index_dxid = dxpy.find_one_data_object(
        classname='file',
        name='bwa_index.tar.gz',
        name_mode='exact',
        project=reference_genome_project,
        folder='/%s' % self.reference_genome,
        zero_ok=False,
        more_ok=False)['id']
def xcor_only(tags, paired_end):
    xcor_only_applet = dxpy.find_one_data_object(classname='applet',
                                                 name='xcor_only',
                                                 zero_ok=False,
                                                 more_ok=False,
                                                 return_handler=True)
    return xcor_only_applet.run({"input_tagAlign": tags,
                                 "paired_end": paired_end})
def find_applet_by_name(applet_name, applets_project_id=APPLETS_PROJECT_ID):
    if (applet_name, applets_project_id) not in APPLETS:
        found = dxpy.find_one_data_object(classname="applet",
                                          name=applet_name,
                                          project=applets_project_id,
                                          zero_ok=False,
                                          more_ok=False,
                                          return_handler=True)
        APPLETS[(applet_name, applets_project_id)] = found
    return APPLETS[(applet_name, applets_project_id)]
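# Hypothetical usage of the memoized lookup above ('bwa' is an illustrative
# applet name, not from the original code): repeated calls for the same
# (name, project) pair are served from the APPLETS dict instead of the API.
bwa = find_applet_by_name('bwa')        # first call queries DNAnexus
bwa_again = find_applet_by_name('bwa')  # second call hits the cache
assert bwa is bwa_again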
def run_isoform_ebseq(treatment_folder, control_folder, patient_id):
    """
    Runs the 'rsem-ebseq isoform expression' workflow I wrote.
    """
    isoform_glob = "*.isoforms.results"
    # The quant files are the isoform expression files created by
    # rsem-calculate-expression.
    quant_file = dxpy.find_one_data_object(more_ok=False,
                                           project=jo_wu_project,
                                           folder=treatment_folder,
                                           name=isoform_glob,
                                           name_mode="glob")
    quant_file = dxpy.dxlink(project_id=quant_file['project'],
                             object_id=quant_file['id'])
    control_quant_file = dxpy.find_one_data_object(more_ok=False,
                                                   project=jo_wu_project,
                                                   folder=control_folder,
                                                   name=isoform_glob,
                                                   name_mode="glob")
    control_quant_file = dxpy.dxlink(project_id=control_quant_file['project'],
                                     object_id=control_quant_file['id'])
    treatment_barcode = os.path.split(treatment_folder)[-1]
    control_barcode = os.path.split(control_folder)[-1]
    wf = dxpy.DXWorkflow(project="project-BxxYqbQ0v3VQz5z2bvFyF6YV",
                         dxid="workflow-ByFPY900v3Vbb8XJVxvzJ1f7")
    destination_folder = os.path.join(treatment_folder, "ebseq")
    transcripts_file = dxpy.dxlink(
        project_id="project-BxZpbXj0V610b5Q6x1FV80gb",
        object_id="file-ByBpjPQ0V61585gqXkvQ11Z5")
    workflow_input = {"0.results": [quant_file, control_quant_file],
                      "1.input_fasta_file": transcripts_file,
                      "1.output_name": "isoforms.results",
                      "2.conditions": "1,1"}
    job_properties = {"patient_id": patient_id,
                      "control_barcode": control_barcode,
                      "treatment_barcode": treatment_barcode}
    job_name = "_".join([patient_id, treatment_barcode, control_barcode,
                         wf.name])
    wf.run(debug={"debugOn": ["AppError", "AppInternalError"]},
           workflow_input=workflow_input,
           project=jo_wu_project,
           folder=destination_folder,
           name=job_name,
           properties=job_properties)
def find_reference_file_by_name(reference_name):
    '''Looks up a reference file by name in the project that holds common
    tools. From Joe Dale's code.'''
    found = dxpy.find_one_data_object(classname="file",
                                      name=reference_name,
                                      project=ENCODE_CHIP_SEQ_PROJECT,
                                      folder='/Reference Data',
                                      recurse=True,
                                      zero_ok=False,
                                      more_ok=False,
                                      return_handler=True)
    print "Resolved %s to %s" % (reference_name, found.get_id())
    return dxpy.dxlink(found)
def pooled(files):
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    logger.debug('input files:%s' % (files))
    logger.debug('input file ids:%s' % ([dxf.get_id() for dxf in files]))
    logger.debug('input files dxlinks:%s'
                 % ([dxpy.dxlink(dxf) for dxf in files]))
    pool_subjob = pool_applet.run(
        {"inputs": [dxpy.dxlink(dxf) for dxf in files]})
    pooled_file = pool_subjob.get_output_ref("pooled")
    return pooled_file
def find_interop_file(self):
    ''' DEPRECATED '''
    interop_name = '%s.InterOp.tar.gz' % self.run_name
    interop_file = dxpy.find_one_data_object(classname='file',
                                             name=interop_name,
                                             name_mode='exact',
                                             project=self.project_id,
                                             folder='/',
                                             zero_ok=False,
                                             more_ok=True)
    return interop_file['id']
def spp(experiment, control, xcor_scores, chrom_sizes, bigbed=False,
        as_file=None):
    spp_applet = dxpy.find_one_data_object(
        classname='applet',
        name='spp',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    spp_input = {"experiment": experiment,
                 "control": control,
                 "xcor_scores_input": xcor_scores,
                 "bigbed": bigbed,
                 "chrom_sizes": chrom_sizes}
    if bigbed and as_file:
        spp_input.update({"as_file": as_file})
    return spp_applet.run(spp_input)
def find_applet_by_name(applet_name, applets_project_id):
    '''Looks up an applet by name in the project that holds tools.
    From Joe Dale's code.'''
    cached = '*'
    if (applet_name, applets_project_id) not in APPLETS:
        found = dxpy.find_one_data_object(classname="applet",
                                          name=applet_name,
                                          project=applets_project_id,
                                          zero_ok=False,
                                          more_ok=False,
                                          return_handler=True)
        APPLETS[(applet_name, applets_project_id)] = found
        cached = ''
    logging.info(cached + "Resolved applet %s to %s"
                 % (applet_name,
                    APPLETS[(applet_name, applets_project_id)].get_id()))
    return APPLETS[(applet_name, applets_project_id)]
def xcor_only(tags, paired_end, spp_version=None, name='xcor_only'):
    xcor_only_applet = dxpy.find_one_data_object(
        classname='applet',
        name='xcor_only',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    applet_input = {"input_tagAlign": tags, "paired_end": paired_end}
    if spp_version:
        applet_input.update({'spp_version': spp_version})
    return xcor_only_applet.run(applet_input, name=name)
def macs2(experiment, control, xcor_scores, chrom_sizes, narrowpeak_as,
          gappedpeak_as, broadpeak_as, genomesize):
    macs2_applet = dxpy.find_one_data_object(
        classname='applet',
        name='macs2',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    macs2_input = {
        "experiment": experiment,
        "control": control,
        "xcor_scores_input": xcor_scores,
        "chrom_sizes": chrom_sizes,
        "narrowpeak_as": narrowpeak_as,
        "gappedpeak_as": gappedpeak_as,
        "broadpeak_as": broadpeak_as,
        "genomesize": genomesize
    }
    return macs2_applet.run(macs2_input)
def xcor_only(tags, paired_end, name='xcor_only'):
    xcor_only_applet = dxpy.find_one_data_object(
        classname='applet',
        name='xcor_only',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    return xcor_only_applet.run(
        {"input_tagAlign": tags, "paired_end": paired_end},
        name=name)
def resolve_file(identifier, server, keypair):
    logger.debug("resolve_file: %s" % (identifier))
    assert identifier, "No file identifier passed to resolve_file"
    m = re.match(r'''^([\w\-\ \.]+):([\w\-\ /\.]+)''', identifier)
    if m:
        # fully specified with project:path
        project_identifier = m.group(1)
        file_identifier = m.group(2)
    else:
        logger.debug("Defaulting to the current project")
        project_identifier = DATA_CACHE_PROJECT
        file_identifier = identifier
    project = resolve_project(project_identifier)
    logger.debug("Got project %s" % (project.name))
    logger.debug("Now looking for file %s" % (file_identifier))
    m = re.match(r'''(^[\w\-\ /\.]+)/([\w\-\ \.]+)''', file_identifier)
    if m:
        folder_name = m.group(1)
        if not folder_name.startswith('/'):
            folder_name = '/' + folder_name
        file_name = m.group(2)
    else:
        folder_name = '/fastqs/'
        file_name = file_identifier + '.fastq.gz'
    logger.debug("Looking for file %s in folder %s" % (file_name, folder_name))
    try:
        file_handler = dxpy.find_one_data_object(name=file_name,
                                                 folder=folder_name,
                                                 project=project.get_id(),
                                                 more_ok=False,
                                                 zero_ok=False,
                                                 return_handler=True)
    except:
        logger.debug('%s not found in project %s folder %s'
                     % (file_name, project.get_id(), folder_name))
        try:
            # maybe it's just filename in the default workspace
            file_handler = dxpy.DXFile(dxid=identifier, mode='r')
        except:
            logger.debug('%s not found as a dxid' % (identifier))
            file_handler = resolve_accession(identifier, server, keypair)
    assert file_handler, "Failed to resolve file identifier %s" % (identifier)
    logger.debug("Resolved file identifier %s to %s"
                 % (identifier, file_handler.name))
    return file_handler
def patch(obs):
    for fob in obs:
        if fob['file_format'] == 'fastq' or fob['status'] == 'revoked':
            continue
        fn = fob['submitted_file_name']
        folder = dxpy.describe(
            dxpy.find_one_data_object(
                name=fn.strip('/'),
                project='project-BQkYKg00F1GP55qQ9Qy00VP0')['id'])['folder']
        newfn = folder + '/' + fn.strip('/')
        print "Patch: %s with %s" % (fn, newfn)
        res = requests.patch(srv + fob['@id'],
                             auth=(id, pw),
                             data=json.dumps({'submitted_file_name': newfn}),
                             headers={'content-type': 'application/json'})
        try:
            res.raise_for_status()
            print "Success"
        except Exception, e:
            print "Failed %s" % e
def find_record(run_name, project):
    """ Wrapper to find the sentinel record for a given run_name in the given
    DNAnexus project """
    try:
        record = dxpy.find_one_data_object(
            classname="record",
            name="*{0}*".format(run_name),
            project=project,
            folder="{0}/{1}/{2}".format(RUN_UPLOAD_DEST.rstrip('/'),
                                        run_name, REMOTE_RUN_FOLDER),
            name_mode="glob",
            return_handler=True,
            more_ok=False,
            zero_ok=False)
        return record
    # Either zero or multiple records were found. When we cannot resolve the
    # upload sentinel uniquely, we exit the program with an error.
    except dxpy.exceptions.DXSearchError, e:
        sys.exit("Unexpected result when searching for upload sentinel of "
                 "run {0}. {1}".format(run_name, e))
def main(reads1, bwa_aln_params, bwa_version, samtools_version, reads2,
         reference_tar, key, debug):
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    # For each input fastq, decide if it's specified as an ENCODE file
    # accession number (ENCFF*).
    reads1_files = [resolve_file(read, key) for read in reads1]
    if len(reads1_files) > 1:
        pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        logger.debug('reads1_files:%s' % (reads1_files))
        logger.debug('reads1_files ids:%s'
                     % ([dxf.get_id() for dxf in reads1_files]))
        logger.debug('reads1_files dxlinks:%s'
                     % ([dxpy.dxlink(dxf) for dxf in reads1_files]))
        pool_subjob = pool_applet.run(
            {"inputs": [dxpy.dxlink(dxf) for dxf in reads1_files]})
        reads1_file = pool_subjob.get_output_ref("pooled")
    else:
        reads1_file = reads1_files[0]
    reads2_file = resolve_file(reads2, key)
    reference_tar_file = resolve_file(reference_tar, key)
    logger.info('Resolved reads1 to %s', reads1_file)
    if reads2:
        logger.info('Resolved reads2 to %s', reads2_file)
    logger.info('Resolved reference_tar to %s', reference_tar_file)
    output = {}
    output.update({'reads1': reads1_file})
    if reads2:
        output.update({"reads2": reads2_file})
    output_json = {
        "reads1": reads1_file,
        "reference_tar": reference_tar_file,
        "bwa_aln_params": bwa_aln_params,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version
    }
    if reads2:
        output_json.update({'reads2': reads2_file})
    output.update({'output_JSON': output_json})
    # logger.info('Exiting with output_JSON: %s' % (json.dumps(output)))
    # return {'output_JSON': json.dumps(output)}
    logger.info('Exiting with output: %s' % (output))
    return output
def copy_files(fids, project_id, folder):
    new_fids = []
    for file_dict in fids:
        f = dxpy.DXFile(dxid=file_dict['id'], project=file_dict['project'])
        fn = f.describe()['name']
        # Check to see if file already exists.
        found_file = dxpy.find_one_data_object(classname='file',
                                               project=project_id,
                                               folder=folder,
                                               zero_ok=True,
                                               name=fn)
        if found_file is None:
            new_fids += [dxpy.dxlink(f.clone(project_id, folder))]
        else:
            new_fids += [dxpy.dxlink(found_file)]
    return new_fids
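# Hypothetical usage of copy_files above (the IDs are illustrative
# placeholders, not real objects): files already present in the destination
# folder are linked rather than cloned again.
links = copy_files(
    fids=[{'id': 'file-xxxx', 'project': 'project-xxxx'}],  # hypothetical
    project_id='project-yyyy',  # hypothetical destination project
    folder='/aligned')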
def find_reference_file_by_name(reference_name, project_name):
    '''Looks up a reference file by name in the project that holds common
    tools. From Joe Dale's code.'''
    project = dxpy.find_one_project(name=project_name, name_mode='exact',
                                    return_handler=False)
    cached = '*'
    if (reference_name, project['id']) not in REFERENCE_FILES:
        found = dxpy.find_one_data_object(classname="file",
                                          name=reference_name,
                                          project=project['id'],
                                          recurse=True,
                                          zero_ok=False,
                                          more_ok=False,
                                          return_handler=True)
        REFERENCE_FILES[(reference_name, project['id'])] = found
        cached = ''
    print cached + "Resolved %s to %s" % (
        reference_name,
        REFERENCE_FILES[(reference_name, project['id'])].get_id())
    return dxpy.dxlink(REFERENCE_FILES[(reference_name, project['id'])])
def resolve_accession(accession, key):
    logger.debug("Looking for accession %s" % (accession))
    if not re.match(r'''^ENCFF\d{3}[A-Z]{3}''', accession):
        logger.warning("%s is not a valid accession format" % (accession))
        return None
    snapshot_project = None  # guard against DATA_CACHE_PROJECT being unset
    if DATA_CACHE_PROJECT:
        logger.debug('Looking for cache project %s' % (DATA_CACHE_PROJECT))
        try:
            project_handler = resolve_project(DATA_CACHE_PROJECT)
            snapshot_project = project_handler
        except:
            logger.error("Cannot find cache project %s" % (DATA_CACHE_PROJECT))
            snapshot_project = None
        logger.debug('Cache project: %s' % (snapshot_project))
    if snapshot_project:
        try:
            accession_search = accession + '*'
            logger.debug('Looking recursively for %s in %s'
                         % (accession_search, snapshot_project.name))
            file_handler = dxpy.find_one_data_object(
                name=accession_search,
                name_mode='glob',
                more_ok=False,
                classname='file',
                recurse=True,
                return_handler=True,
                folder='/',
                project=snapshot_project.get_id())
            logger.debug('Got file handler for %s' % (file_handler.name))
            return file_handler
        except:
            logger.debug("Cannot find accession %s in project %s"
                         % (accession, snapshot_project))
    # We're here because we couldn't find the cache or couldn't find the file
    # in the cache, so look in AWS. s3cp returns a link to the file in the
    # applet's project context.
    dx_file = s3cp(accession, key)
    if not dx_file:
        logger.warning('Cannot find %s. Giving up.' % (accession))
        return None
    else:
        return dx_file
def get_barcode_stats(self, barcode):
    """
    Loads the JSON in a ${barcode}_stats.json file in the DNAnexus project
    (usually in the qc folder).
    """
    filename = barcode + "_stats.json"
    # In the call to dxpy.find_one_data_object() below, I'd normally set the
    # more_ok parameter to False, but this blows up in Python 3.7, giving me a
    # RuntimeError. So I just won't set it for now. I think dxpy is still
    # mainly a Python 2.7 library and can break in later versions of Python 3.
    file_id = dxpy.find_one_data_object(zero_ok=False,
                                        project=self.dx_project_id,
                                        name=filename)["id"]
    json_data = json.loads(dxpy.open_dxfile(file_id).read())
    return json_data
def test_mapping():
    dxpy.set_workspace_id('project-BpBjyqQ0Jk0Xv2B11Q8P6X59')
    applet = dxpy.find_one_data_object(
        name='bwa_mem_fastq_read_mapper',
        classname='applet',
        return_handler=True,
        zero_ok=False,
        project='project-B406G0x2fz2B3GVk65200003')
    applet.run({
        'genomeindex_targz': dxpy.dxlink('file-B6qq53v2J35Qyg04XxG0000V'),
        'reads_fastqgz': dxpy.dxlink('file-BpBjzFQ0Jk0Xk73YqQgJKg9Z'),
        'reads2_fastqgz': dxpy.dxlink('file-BpBk0400Jk0Xk73YqQgJKg9f')
    })
def resolve_accession(accession, server, keypair):
    logger.debug("Looking for accession %s" % (accession))
    if not re.match(r'''^ENCFF\d{3}[A-Z]{3}''', accession):
        logger.warning("%s is not a valid accession format" % (accession))
        return None
    snapshot_project = None  # guard against DATA_CACHE_PROJECT being unset
    if DATA_CACHE_PROJECT:
        logger.debug('Looking for cache project %s' % (DATA_CACHE_PROJECT))
        try:
            project_handler = resolve_project(DATA_CACHE_PROJECT)
            snapshot_project = project_handler
        except:
            logger.error("Cannot find cache project %s" % (DATA_CACHE_PROJECT))
            snapshot_project = None
        logger.debug('Cache project: %s' % (snapshot_project))
    if snapshot_project:
        try:
            accession_search = accession + '*'
            logger.debug('Looking recursively for %s in %s'
                         % (accession_search, snapshot_project.name))
            file_handler = dxpy.find_one_data_object(
                name=accession_search,
                name_mode='glob',
                more_ok=False,
                classname='file',
                recurse=True,
                return_handler=True,
                folder='/',
                project=snapshot_project.get_id())
            logger.debug('Got file handler for %s' % (file_handler.name))
            return file_handler
        except:
            logger.debug("Cannot find accession %s in project %s"
                         % (accession, snapshot_project))
    # We're here because we couldn't find the cache or couldn't find the file
    # in the cache, so look in AWS. s3_dxcp returns a link to the file in the
    # applet's project context.
    dx_file = s3_dxcp(accession, server, keypair)
    if not dx_file:
        logger.warning('Cannot find %s. Giving up.' % (accession))
        return None
    else:
        return dx_file
def find_reference_file_by_name(reference_name, applets_project_id):
    '''Looks up a reference file by name in the project that holds common
    tools. From Joe Dale's code.'''
    cached = '*'
    if (reference_name, applets_project_id) not in REFERENCE_FILES:
        found = dxpy.find_one_data_object(classname="file",
                                          name=reference_name,
                                          project=applets_project_id,
                                          folder='/Reference Data',
                                          recurse=True,
                                          zero_ok=False,
                                          more_ok=False,
                                          return_handler=True)
        REFERENCE_FILES[(reference_name, applets_project_id)] = found
        cached = ''
    print cached + "Resolved %s to %s" % (
        reference_name,
        REFERENCE_FILES[(reference_name, applets_project_id)].get_id())
    return dxpy.dxlink(REFERENCE_FILES[(reference_name, applets_project_id)])
def get_run_details_json(self):
    """
    Retrieves the JSON object for the stats in the file named run_details.json
    in the project specified by self.dx_project_id.

    Returns:
        JSON object of the run details, or None if the file doesn't exist.
    """
    run_details_filename = "run_details.json"
    res = dxpy.find_one_data_object(more_ok=False,
                                    zero_ok=True,
                                    project=self.dx_project_id,
                                    name=run_details_filename)
    # With zero_ok=True, find_one_data_object() returns None when the file is
    # absent, so guard before subscripting the result.
    if res is None:
        return None
    run_details_json_id = res["id"]
    json_data = json.loads(dxpy.open_dxfile(dxid=run_details_json_id).read())
    # dxpy.download_dxfile(show_progress=True, dxid=run_details_json_id,
    #                      project=self.dx_project_id, filename=output_name)
    return json_data
def copy_files(fids, project, folder):
    new_fids = []
    for fid in fids:
        (pid, fid) = fid.split(':')
        f = dxpy.DXFile(dxid=fid, project=pid)
        fn = f.describe()['name']
        found_file = dxpy.find_one_data_object(classname='file',
                                               project=project.get_id(),
                                               folder=folder,
                                               zero_ok=True,
                                               name=fn)
        if found_file is None:
            new_fids += [dxpy.dxlink(f.clone(project.get_id(), folder))]
        else:
            new_fids += [dxpy.dxlink(found_file)]
    return new_fids
def resolve_dx_file(identifier):
    try:
        handler = dxpy.get_handler(identifier)
    except dxpy.DXError:
        try:
            handler = dxpy.find_one_data_object(classname='file',
                                                name=identifier,
                                                return_handler=True,
                                                zero_ok=False,
                                                more_ok=False)
        except dxpy.DXSearchError:
            logging.error(
                'Failed to resolve control %s to unique dx object. '
                'ID or name does not exist or multiple files of that name '
                'were found.' % (str(identifier)))
            return None
        else:
            return handler
    else:
        return handler
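# A usage sketch for resolve_dx_file above (the identifier is a hypothetical
# file name, not from the original code; assumes dxpy is authenticated and a
# current project is selected for the name-based search). The function accepts
# either a dx ID or a file name and returns None when resolution fails.
control = resolve_dx_file('rep1_control.tagAlign.gz')
if control is not None:
    print "Resolved control to %s" % control.get_id()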
def get_sample_stats_json(self, barcode=None):
    """
    .. deprecated:: 0.1.0
       GSSC has removed the sample_stats.json file since the entire folder it
       was in has been removed. Use :meth:`get_barcode_stats` instead.

    Retrieves the JSON object for the stats in the file named
    sample_stats.json in the project specified by self.dx_project_id. This
    file is located in the DNAnexus folder stage\d_qc_report.

    Args:
        barcode: `str`. The barcode for the sample.

    Currently, the sample_stats.json file is of the following form when there
    isn't a genome mapping:

        [{"Sample name": "AGTTCC"}, {"Sample name": "CAGATC"},
         {"Sample name": "GCCAAT"}, ...]

    When there is a mapping, each dictionary has many more keys in addition to
    the "Sample name" one.

    Returns:
        `list` of dicts if barcode=None, otherwise a dict for the given
        barcode.
    """
    sample_stats_json_filename = "sample_stats.json"
    sample_stats_json_id = dxpy.find_one_data_object(
        more_ok=False,
        zero_ok=False,
        project=self.dx_project_id,
        name=sample_stats_json_filename)["id"]
    # dxpy.download_dxfile(dxid=sample_stats_json_id,
    #                      project=self.dx_project_id,
    #                      filename=sample_stats_json_filename)
    json_data = json.loads(dxpy.open_dxfile(sample_stats_json_id).read())
    if not barcode:
        return json_data
    for d in json_data:  # d is a dictionary
        sample_barcode = d["Sample name"]
        if sample_barcode == barcode:
            return d
    # A barcode was given but not found in the stats file.
    raise DnanexusBarcodeNotFound(
        "Barcode {barcode} for {library_name} not found in "
        "{sample_stats_json_filename} in project {project}.".format(
            barcode=barcode,
            library_name=self.library_name,
            sample_stats_json_filename=sample_stats_json_filename,
            project=self.dx_project_id))
def find_reference_file_by_name(reference_name, project_name):
    '''Looks up a reference file by name in the project that holds common
    tools. From Joe Dale's code.'''
    project = dxpy.find_one_project(name=project_name, name_mode='exact',
                                    return_handler=False)
    cached = '* '
    if (reference_name, project['id']) not in REFERENCE_FILES:
        found = dxpy.find_one_data_object(classname="file",
                                          name=reference_name,
                                          project=project['id'],
                                          recurse=True,
                                          zero_ok=False,
                                          more_ok=False,
                                          return_handler=True)
        REFERENCE_FILES[(reference_name, project['id'])] = found
        cached = ''
    # print >> sys.stderr, cached + "Resolved %s to %s" % \
    #     (reference_name, REFERENCE_FILES[(reference_name, project['id'])].get_id())
    return dxpy.dxlink(REFERENCE_FILES[(reference_name, project['id'])])
def main(pop1, pop2, skip=25, recals=2):
    # Split your work into parallel tasks. As an example, the following
    # generates 10 subjobs running with the same dummy input.
    psmc20_id = "project-B53fX06gYqYbb6B87kgQ0007"
    # psmc20_id = dxpy.find_one_project(zero_ok=True, more_ok=False,
    #                                   name="PSMC_20")['id']
    # print psmc20_id, dxpy.WORKSPACE_ID
    pipeline = dxpy.find_one_data_object(name="PSMC-pipeline",
                                         name_mode="regexp",
                                         project=psmc20_id,
                                         return_handler=True)
    files1 = {}
    for result in dxpy.find_data_objects(name=pop1,
                                         name_mode="regexp",
                                         classname="file",
                                         folder="/ConsensusSequences",
                                         project=psmc20_id):
        id = result["id"]
        name = dxpy.describe(id)["name"]
        files1[name] = id
    files2 = {}
    if pop1 != pop2:
        for result in dxpy.find_data_objects(name=pop2,
                                             name_mode="regexp",
                                             classname="file",
                                             folder="/ConsensusSequences",
                                             project=psmc20_id):
            id = result["id"]
            name = dxpy.describe(id)["name"]
            files2[name] = id
    if len(files2) == 0 and pop1 != pop2:
        return {}
    appjobs = []
    if len(files2) == 0:
        # Single population processing: pair up consensus sequences within pop1.
        subjobs = []
        fn1sort = files1.keys()
        fn1sort.sort()
        for i in range(len(fn1sort)):
            for j in range(i + 1, len(fn1sort)):
                outroot = pop1 + "." + str(i + 1) + "." + pop1 + "." + str(j + 1)
                applet_in = {"cons1": dxpy.dxlink(files1[fn1sort[i]]),
                             "cons2": dxpy.dxlink(files1[fn1sort[j]]),
                             "outroot": outroot,
                             "skip": skip,
                             "recalnums": recals}
                # appjobs.append(pipeline.run(applet_input=applet_in))
                print ("dx run -y --folder /psmcfa" +
                       " -icons1=/ConsensusSequences/" + fn1sort[i] +
                       " -icons2=/ConsensusSequences/" + fn1sort[j] +
                       " -ioutroot=" + outroot +
                       " -iskip=" + str(skip) +
                       " -irecalnums=" + str(recals) +
                       " PSMC-pipeline")
    elif len(files2) > 0:
        # Two-population processing: pair each pop1 sequence with each pop2 sequence.
        subjobs = []
        fn1sort = files1.keys()
        fn2sort = files2.keys()
        fn1sort.sort()
        fn2sort.sort()
        for i in range(len(fn1sort)):
            for j in range(len(fn2sort)):
                outroot = pop1 + "." + str(i + 1) + "." + pop2 + "." + str(j + 1)
                applet_in = {"cons1": dxpy.dxlink(files1[fn1sort[i]]),
                             "cons2": dxpy.dxlink(files2[fn2sort[j]]),
                             "outroot": outroot,
                             "skip": skip,
                             "recalnums": recals}
                # appjobs.append(pipeline.run(applet_input=applet_in))
                print ("dx run -y --folder /psmcfa" +
                       " -icons1=/ConsensusSequences/" + fn1sort[i] +
                       " -icons2=/ConsensusSequences/" + fn2sort[j] +
                       " -ioutroot=" + outroot +
                       " -iskip=" + str(skip) +
                       " -irecalnums=" + str(recals) +
                       " PSMC-pipeline")
    # for job in app1jobs.keys():
    #     print job
    #     print app1jobs[job]
    #     print app1jobs[job].describe()
    #     print app1jobs[job].get_output_ref("psmcfa")
    #     print app1jobs[job].get_output_ref("psmcfa").describe()

    # The following line creates the job that will perform the "postprocess"
    # step of your app. We've given it an input field that is a list of
    # job-based object references created from the "process" jobs we just
    # created. Assuming those jobs have an output field called "output",
    # these values will be passed to the "postprocess" job. Because these
    # values are not ready until the "process" jobs finish, the "postprocess"
    # job WILL NOT RUN until all job-based object references have been
    # resolved (i.e. the jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that the
    # "postprocess" job will require, then you can explicitly list the
    # dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it accepts
    # either dxpy handlers or string IDs in the list). We've included this
    # parameter in the line below as well for completeness, though it is
    # unnecessary if you are providing job-based object references in the
    # input that refer to the same set of jobs.
    #
    # of1 = {}
    # for j in app1jobs:
    #     of1[j] = app1jobs[j].get_output_ref("psmcfa")
    # postprocess_job = dxpy.new_dxjob(fn_input={"files1": of1, "files2": []},
    #                                  fn_name="postprocess")
    #
    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it here
    # using a job-based object reference. If the output field in the
    # postprocess function is called "answer", you can pass that on here as
    # follows:
    #
    # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open objects
    # (such as gtables) which will be closed by a job that finishes later.
    # The system will check to make sure that the output object is closed and
    # will attempt to clone it out as output into the parent container only
    # after all subjobs have finished.
    psmcfaFiles = []
    psmcFiles = []
    for job in appjobs:
        psmcfaFiles.append(job.get_output_ref("outfile1"))
        psmcFiles.append(job.get_output_ref("outfile2"))
    output = {"psmcfaFiles": psmcfaFiles, "psmcFiles": psmcFiles}
    return output