def makeInputsBwa():
    """Build and return the input mapping for a BWA test run.

    Looks up the two importer applets by their "name" property, imports a
    reference contig set (hg19 chr22) and a pair of small FASTQ files, and
    returns {"reads": [reads]*3, "reference": contig_set} suitable for
    feeding to the BWA applet.

    Raises:
        Exception: if either importer applet cannot be found.
    """
    try:
        # .next() on the search generator raises StopIteration when no
        # applet with the given name property exists (Python 2 idiom).
        contigset_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "fasta_contigset_importer"}).next()['id'])
        reads_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "Letter Space FASTQ importer"}).next()['id'])
    except StopIteration:
        raise Exception("fasta_contigset_importer or Letter Space FASTQ importer not found, please upload them")
    # Upload the compressed reference genome and import it as a contig set.
    genome_archive = dxpy.upload_local_file(os.path.join(test_resources_dir, "hg19_chr22.fa.xz"), wait_on_close=True)
    contigset_importer_input = {"name": "hg19_chr22", "sequence_file": dxpy.dxlink(genome_archive)}
    print "Running fasta_contigset_importer with", contigset_importer_input
    job = contigset_importer.run(contigset_importer_input)
    job.wait_on_done()
    contig_set = job.describe()["output"]["contig_set"]
    # Upload the left/right FASTQ read files (small test data set).
    left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_left.fq"), wait_on_close=True)
    right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_right.fq"), wait_on_close=True)
    #left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_1_1M.fastq.xz"), wait_on_close=True)
    #right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_2_1M.fastq.xz"), wait_on_close=True)
    reads_importer_input = {"left_file": dxpy.dxlink(left_reads), "right_file": dxpy.dxlink(right_reads)}
    print "Running LetterSpaceFileObjectToReadsTable with", reads_importer_input
    job = reads_importer.run(reads_importer_input)
    job.wait_on_done()
    reads = job.describe()["output"]["reads"]
    # The downstream consumer expects three read inputs; the same reads
    # table is linked three times — presumably deliberate for this test.
    return {"reads": [reads] * 3, "reference": contig_set}
def get_fastq_dxfile_objects(self, barcode=None):
    """
    Retrieves all the FASTQ files in project self.dx_project_name as DXFile objects.

    Args:
        barcode: `str`. If set, then only FASTQ file properties for FASTQ files
            having the specified barcode are returned.

    Returns:
        `list` of DXFile objects representing FASTQ files.

    Raises:
        `dnanexus_utils.FastqNotFound`: No FASTQ files were found.
    """
    barcode_pattern = re.compile("[ACGT]{6,}-[ACGT]{6,}", re.I)
    glob_name = "*{}".format(self.FQEXT)
    # Search the dedicated FASTQ folder first.
    hits = list(
        dxpy.find_data_objects(project=self.dx_project_id,
                               folder=self.DX_FASTQ_FOLDER,
                               name=glob_name,
                               name_mode="glob"))
    if not hits:
        # Fall back to searching every folder in the project.
        hits = list(
            dxpy.find_data_objects(project=self.dx_project_id,
                                   name=glob_name,
                                   name_mode="glob"))
    if not hits:
        debug_logger.info("No FASTQ files found for run {run} ".format(
            run=self.dx_project_name))
        return []
    handlers = [dxpy.DXFile(project=h["project"], dxid=h["id"]) for h in hits]
    if not barcode:
        return handlers
    # Keep only DXFiles carrying the requested barcode (property first,
    # then fall back to parsing the file name).
    matched = []
    for handler in handlers:
        bc = handler.get_properties().get(self.FQFILE_BARCODE_PROP_NAME)
        if not bc:
            m = barcode_pattern.search(handler.name)
            if m:
                bc = m.group()
        if bc == barcode:
            matched.append(handler)
    if matched:
        return matched
    msg = "No FASTQ files found for run {run} and barcode {barcode}.".format(
        run=self.dx_project_name, barcode=barcode)
    debug_logger.error(msg)
    raise FastqNotFound(msg)
def main():
    """Create a (bz2-compressed JSON) manifest for a folder of a project.

    Parses the folder from the command line, lists all closed files in it
    (optionally recursively), renames duplicate file names to
    ``<name>_<fileid><ext>``, and writes the manifest to the output file.
    """
    parser = argparse.ArgumentParser(description='Create a manifest file for a particular folder in a project')
    parser.add_argument('folder', help='a folder in the current DNAnexus project')
    parser.add_argument('-o', '--output_file', help='Name of the output file', default='manifest.json.bz2')
    parser.add_argument('-r', '--recursive', help='Recursively traverse folders and append to manifest',
                        action='store_true', default=False)
    args = parser.parse_args()

    project, folder, _ = resolve_existing_path(args.folder)

    found = dxpy.find_data_objects(
        classname='file',
        first_page_size=1000,
        state='closed',
        describe={'fields': {'id': True,
                             'name': True,
                             'folder': True,
                             'parts': True,
                             'state': True,
                             'archivalState': True}},
        project=project,
        folder=folder,
        recurse=args.recursive)

    manifest = {project: []}
    for count, entry in enumerate(found):
        manifest[project].append(fileID2manifest(entry['describe'], project))
        # Progress marker every 1000 files (skip the very first entry).
        if count % 1000 == 0 and count != 0:
            print("Processed {} files".format(count))

    # Dedup
    # Duplicate filenames are converted to filename_fileid
    tallies = collections.Counter(e['name'] for e in manifest[project])
    duplicated = {name for name, c in tallies.items() if c > 1}
    for entry in manifest[project]:
        if entry['name'] in duplicated:
            stem, ext = os.path.splitext(entry['name'])
            entry['name'] = stem + "_" + entry['id'] + ext

    write_manifest_to_file(args.output_file, manifest)
    print("Manifest file written to {}".format(args.output_file))
    print("Total {} objects".format(len(manifest[project])))
def get_tas(exp_id, default_project, ta_folders):
    """Find the rep1 and rep2 tagAlign files for an experiment.

    Each entry of ta_folders may be "project:path" or a bare path (searched
    in default_project).  Candidates are closed files under a '/bams' folder
    whose folder path contains exp_id and whose name ends in
    tagAlign/tagAlign.gz.

    Returns a (rep1, rep2) tuple of "project:folder/name" strings; an
    element is None unless exactly one candidate exists for that replicate.
    """
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            # Explicit "project:path" form — resolve the named project.
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project):
            desc = dxfile.get('describe')
            # Keep only tagAligns under a /bams folder for this experiment.
            if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    print "%s %i possible files" %(exp_id, len(possible_files))
    # Partition candidates by replicate folder; require exactly one per rep.
    rep1_files = [f for f in possible_files if 'rep1' in f.get('folder')]
    rep2_files = [f for f in possible_files if 'rep2' in f.get('folder')]
    if len(rep1_files) != 1:
        print "Tried to find one rep1 ta, found %d" %(len(rep1_files))
        rep1 = None
    else:
        rep1 = rep1_files[0].get('project') + ':' + rep1_files[0].get('folder') + '/' + rep1_files[0].get('name')
    if len(rep2_files) != 1:
        print "Tried to find one rep2 ta, found %d" %(len(rep2_files))
        rep2 = None
    else:
        rep2 = rep2_files[0].get('project') + ':' + rep2_files[0].get('folder') + '/' + rep2_files[0].get('name')
    return rep1, rep2
def main(): cmnd = get_args() ## resolve projects project = dxencode.resolve_project(PROJECT_NAME) print 'Project: ' + project.describe()['name'] pid = project.get_id() counts = {} n = 0 summaries = dxpy.find_data_objects(classname='file', folder='/runs', name='*_summary.txt', recurse=True, name_mode='glob', project=pid, return_handler=False) while summaries: try: flink = dxpy.dxlink(summaries.next()) n = n+1 except StopIteration: break fd = dxpy.describe(flink) fn = "fastqc/%s" % fd['name'] if not os.path.isfile(fn): print 'Downloading: %s from %s' % (fn, fd['folder']) try: dxpy.download_dxfile(flink, fn) except Exception, e: print "Error %s" % e parse_summary(fn, counts)
def get_repns(exp_id, ta_folders): for base_folder in ta_folders: if ':' in base_folder: project_name, path = base_folder.split(':') project = resolve_project(project_name) project = project.get_id() project_name += ":" else: project = default_project project_name = "" path = base_folder if not path.startswith('/'): path = '/' + path print project, project_name, path for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project): desc = dxfile.get('describe') if exp_id in desc.get('folder') and '/bams' in desc.get( 'folder') and desc.get('name').endswith( ('tagAlign', 'tagAlign.gz')): possible_files.append(desc) print "%s %i possible files" % (exp_id, len(possible_files)) folders = [f.get('folder') for f in possible_files] print "%s folders %s" % (exp_id, folders)
def _format_data_file(self, df: DataFile) -> dict:
    """Turn a DataFile into a dxlink, uploading the local file if needed.

    dx:// URLs are linked directly; otherwise the project folder is
    searched for a closed file of the same name, which is reused when
    exactly one exists. Raises RuntimeError on ambiguous matches.
    """
    if isinstance(df.localizer, UrlLocalizer):
        url_localizer = cast(UrlLocalizer, df.localizer)
        if url_localizer.url.startswith("dx://"):
            # "dx://project:file" — split into dxlink components.
            return dxpy.dxlink(*url_localizer.url[5:].split(":"))

    file_name = df.local_path.name
    matches = list(dxpy.find_data_objects(
        classname="file",
        state="closed",
        name=file_name,
        project=self._project_id,
        folder=self._folder,
        recurse=False
    ))

    if len(matches) > 1:
        raise RuntimeError(
            f"Multiple files with name {file_name} found in "
            f"{self._project_id}:{self._folder}"
        )
    if len(matches) == 1:
        return dxpy.dxlink(matches[0]["id"], self._project_id)

    # TODO: batch uploads and use dxpy.sugar.transfers.Uploader for
    # parallelization
    return dxpy.dxlink(dxpy.upload_local_file(
        str(df.path),
        name=file_name,
        project=self._project_id,
        folder=self._folder,
        parents=True,
        wait_on_close=True
    ))
def create_dxrecord(self, develop):
    """Find or create the SCGPMRun dashboard record for this run/lane.

    If a record with the expected name already exists in the dashboard
    project it is reused; otherwise a new record is created and closed.
    Sets self.record_id either way.

    Args:
        develop: bool. When True the record name is prefixed 'dev_' and
            the 'production' property is 'false' so production dashboards
            can filter it out; the notification email is also overridden.
    """
    details = self._get_record_details()
    self.record_properties = self._set_record_properties()
    if develop:
        record_name = 'dev_%s_L%d' % (self.run_name, self.lane_index)
        self.record_properties['production'] = 'false'
        #self.record_properties['status'] = 'uploading'
        details['email'] = '*****@*****.**'
    else:
        record_name = '%s_L%d' % (self.run_name, self.lane_index)
        self.record_properties['production'] = 'true'
    # Reuse an existing record with this exact name, if any.
    record_generator = dxpy.find_data_objects(classname = 'record',
                                              name = record_name,
                                              name_mode = 'exact',
                                              project = self.dashboard_project_id,
                                              folder = '/')
    records = list(record_generator)
    if len(records) > 0:
        self.record_id = records[0]['id']
    else:
        input_params={
            "project": self.dashboard_project_id,
            "name": record_name,
            "types": ["SCGPMRun"],
            "properties": self.record_properties,
            "details": details
        }
        print input_params
        self.record_id = dxpy.api.record_new(input_params)['id']
        # Close immediately: the record's metadata is complete at creation.
        dxpy.api.record_close(self.record_id)
def get_all_tas(experiment, default_project, ta_folders):
    """Collect describe-dicts for all tagAlign files of an experiment.

    Searches <base>/bams/<accession>/ in each ta_folder (either
    "project:path" or a bare path inside default_project) and returns the
    describe mappings of closed files ending in tagAlign/tagAlign.gz.
    """
    logging.debug(
        'get_all_tas: enter with experiment %s default_project %s and ta_folders %s'
        % (experiment.get('accession'), default_project, ta_folders))
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            # Explicit "project:path" form — resolve the named project.
            project_name, base_path = base_folder.split(':')
            project = resolve_project(project_name).get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            base_path = base_folder
        # Normalize to an absolute folder path with a trailing slash.
        if not base_path.startswith('/'):
            base_path = '/' + base_path
        if not base_path.endswith('/'):
            base_path = base_path + '/'
        path = base_path + 'bams/' + exp_id + '/'
        logging.debug(
            "get_all_tas: find_data objects in project %s project_name %s path %s"
            % (project, project_name, path))
        for dxfile in dxpy.find_data_objects(classname='file',
                                             state='closed',
                                             folder=path,
                                             describe=True,
                                             recurse=True,
                                             project=project):
            desc = dxfile.get('describe')
            logging.debug(
                "get_all_tas: checking object for match: folder %s name %s"
                % (desc.get('folder'), desc.get('name')))
            folder_ok = exp_id in desc.get('folder') and '/bams' in desc.get('folder')
            if folder_ok and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug(
        "get_all_tas: exit with possible_files %s" % (possible_files))
    return possible_files
def test_full_pipeline(self):
    """Run BWA (unless a precomputed mappings ID is available) and feed the
    resulting mappings into GATK, printing both job outputs.

    NOTE(review): `mappingsId` is read from module scope — presumably set
    by the test harness; verify against the surrounding file.
    """
    if mappingsId == False:
        input = self.base_input
        print "Running program with", input
        try:
            # .next() raises StopIteration when no BWA applet exists.
            bwa = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "BWA"}).next()['id'])
        except:
            # NOTE(review): only prints — `bwa` stays unbound, so the run()
            # call below raises NameError when the applet is missing.
            print "BWA not found, please upload it"
        job = bwa.run(input)
        job.wait_on_done()
        print "Bwa output:"
        print json.dumps(job.describe()["output"])
        mappings = job.describe()["output"]["mappings"][0]
    else:
        # Reuse pre-existing mappings instead of rerunning BWA.
        mappings = {"$dnanexus_link":mappingsId}
    print mappings
    #print {'mappings':mappings, "output_mode":"EMIT_VARIANTS_ONLY"}
    #hg19_chr22 reference:
    job = self.gatk.run({'mappings':mappings, 'reference':{"$dnanexus_link":"record-9ykz7KQ00006B3PXk1b00005"}, "output_mode":"EMIT_VARIANTS_ONLY"})
    #yeast
    #job = self.gatk.run({'mappings':mappings, 'reference':{"$dnanexus_link":"record-9zPp07j000035P6yJ9kQ0006"}, "output_mode":"EMIT_ALL_CONFIDENT_SITES"})
    #ce
    #job = self.gatk.run({'mappings':mappings, 'reference':{"$dnanexus_link":"record-9zV2FBQ0000293088JZ00005"}, "output_mode":"EMIT_ALL_CONFIDENT_SITES"})
    job.wait_on_done()
    print "GATK output:"
    print json.dumps(job.describe()["output"])
def main(): cmnd = get_args() ## resolve projects project = dxencode.resolve_project(PROJECT_NAME) print 'Project: ' + project.describe()['name'] pid = project.get_id() counts = {} n = 0 summaries = dxpy.find_data_objects(classname='file', folder='/runs', name='*_summary.txt', recurse=True, name_mode='glob', project=pid, return_handler=False) while summaries: try: flink = dxpy.dxlink(summaries.next()) n = n + 1 except StopIteration: break fd = dxpy.describe(flink) fn = "fastqc/%s" % fd['name'] if not os.path.isfile(fn): print 'Downloading: %s from %s' % (fn, fd['folder']) try: dxpy.download_dxfile(flink, fn) except Exception, e: print "Error %s" % e parse_summary(fn, counts)
def get_all_tas(experiment, default_project, ta_folders): exp_id = experiment['accession'] possible_files = [] for base_folder in ta_folders: if ':' in base_folder: project_name, path = base_folder.split(':') project = resolve_project(project_name) project = project.get_id() project_name += ":" else: project = default_project project_name = "" path = base_folder if not path.startswith('/'): path = '/' + path print project, project_name, path for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project): desc = dxfile.get('describe') if exp_id in desc.get('folder') and '/bams' in desc.get( 'folder') and desc.get('name').endswith( ('tagAlign', 'tagAlign.gz')): possible_files.append(desc) return possible_files
def ccle_fetch_existing(info):
    """Look for the expected CCLE files (matched by name, md5 property and
    md5 in details) already present in the project.

    When the complete set exists, clone the files into the workspace and
    return them; otherwise print a status message and return None.
    """
    analysis_id = str(info['analysis_id'])
    expected_files = ccle_expected_files(info)
    print '\n\nLooking for existing data for {} in the project, consisting of files: {}'.format(analysis_id,json.dumps(expected_files))
    # for each expected file, see if it's already in the project
    existing = []
    for md5 in expected_files:
        for candidate in dxpy.find_data_objects(project=dxpy.PROJECT_CONTEXT_ID, classname='file', state='closed', name=expected_files[md5], name_mode='exact', properties={'md5': md5}, return_handler=True):
            deets = candidate.get_details()
            # Verify the md5 recorded in the details, not just the property.
            if 'cghub_metadata' in deets and 'md5' in deets and deets['md5'] == md5:
                existing.append(candidate)
                # One matching candidate per expected file is enough.
                break
    # if the project already has all of them, we can quit early
    if len(existing) == len(expected_files):
        print 'The files are already in the project!'
        dxpy.DXProject(dxpy.PROJECT_CONTEXT_ID).clone(dxpy.WORKSPACE_ID,objects=[dxfile.get_id() for dxfile in existing])
        return existing
    elif len(existing) > 0:
        print 'Only some of the files are already in the project!'
    else:
        print 'No existing data found in the project.'
    return None
def determineStepsToDo(pairedEnd, priors, deprecate, projectId, force=False): '''Determine what steps need to be done, base upon prior results.''' willCreate = [] stepsToDo = [] steps = [] if pairedEnd: steps = STEP_ORDER['pe'] else: steps = STEP_ORDER['se'] for step in steps: # Force will include the first step with all its inputs # This should avoid forcing concat if it isn't needed # if force: inputs = STEPS[step]['inputs'].keys() count = 0 for input in inputs: if input in priors: count += 1 if count == len(inputs): stepsToDo += [ step ] if step not in stepsToDo: results = STEPS[step]['results'].keys() for result in results: if result not in priors: #print "- Adding step '"+step+"' because prior '"+result+"' was not found." stepsToDo += [ step ] break # If results are there but inputs are being recreated, then step must be rerun if step not in stepsToDo: inputs = STEPS[step]['inputs'].keys() for inp in inputs: if inp in willCreate: #print "- Adding step '"+step+"' due to prior step dependency." stepsToDo += [ step ] break # Any step that is rerun, will cause prior results to be deprecated # NOTE: It is necessary to remove from 'priors' so succeeding steps are rerun # NOTE: It is also important to move prior results out of target folder to avoid confusion! if step in stepsToDo: results = STEPS[step]['results'].keys() for result in results: willCreate += [ result ] if result in priors: deprecate += [ priors[result] ] del priors[result] # if results are in folder, then duplicate files cause a problem! # So add to 'deprecate' to move or remove before launching # Now make sure the steps can be found, and error out if not. for step in stepsToDo: app = STEPS[step]['app'] dxApp = dxpy.find_data_objects(classname='file', name=app, name_mode='exact', project=projectId, return_handler=False) if dxApp == None: print "ERROR: failure to locate app '"+app+"'!" sys.exit(1) return stepsToDo
def walkfiles(self, pattern=None, canonicalize=False, recurse=True,
              starts_with=None, limit=None, classname=None):
    """Iterates over listed files that match an optional pattern.

    Args:
        pattern (str): glob pattern to match the filenames against.
        canonicalize (bool, default False): if True, return canonical paths
        recurse (bool, default True): if True, look in subfolders of folder as well
        starts_with (str): Allows for an additional search path to be
            appended to the resource of the dx path. Note that this
            resource path is treated as a directory
        limit (int): Limit the amount of results returned
        classname (str): Restricting class: one of 'record', 'file',
            'gtable', 'applet', 'workflow'

    Returns:
        Iter[DXPath]: Iterates over listed files that match an optional pattern.
    """
    proj_name = self.virtual_project
    query = {
        'project': self.canonical_project,
        'name': pattern,
        'name_mode': 'glob',
        # the query performance is similar w/wo describe field,
        # hence no need to customize query based on canonicalize flag
        'describe': {'fields': {'name': True, 'folder': True}},
        'recurse': recurse,
        'classname': classname,
        'limit': limit,
        'folder': ('/' + (self.resource or '')) + (starts_with or ''),
    }
    with _wrap_dx_calls():
        found = dxpy.find_data_objects(**query)
    for entry in found:
        if canonicalize:
            yield DXCanonicalPath('dx://{}:/{}'.format(
                entry['project'], entry['id']))
            continue
        yield DXVirtualPath(
            '{drive}{proj_name}:{folder}/{name}'.format(
                drive=self.drive,
                proj_name=proj_name,
                folder=entry['describe']['folder'].rstrip('/'),
                name=entry['describe']['name']))
def get_data_matches(text, delim_pos, dxproj, folderpath, classname=None,
                     typespec=None, visibility=None):
    '''
    :param text: String to be tab-completed; still in escaped form
    :type text: string
    :param delim_pos: index of last unescaped "/" or ":" in text
    :type delim_pos: int
    :param dxproj: DXProject handler to use
    :type dxproj: DXProject
    :param folderpath: Unescaped path in which to search for data object matches
    :type folderpath: string
    :param classname: Data object class by which to restrict the search
                      (None for no restriction on class)
    :type classname: string
    :param visibility: Visibility to constrain the results to; default is
                       "visible" for empty strings, "either" for nonempty
    :type visibility: string
    :returns: List of matches
    :rtype: list of strings

    Members of the returned list are guaranteed to start with *text* and be
    in escaped form for consumption by the command-line.
    '''
    #unescaped_text = unescape_completion_name_str(text[delim_pos + 1:])
    unescaped_text = text[delim_pos + 1:]

    if visibility is None:
        if text != '' and delim_pos != len(text) - 1:
            visibility = "either"
        else:
            visibility = "visible"

    try:
        results = list(
            dxpy.find_data_objects(project=dxproj.get_id(),
                                   folder=folderpath,
                                   name=unescaped_text + "*",
                                   name_mode="glob",
                                   recurse=False,
                                   visibility=visibility,
                                   classname=classname,
                                   limit=100,
                                   describe=True,
                                   typename=typespec))
        prefix = '' if text == '' else text[:delim_pos + 1]
        return [
            prefix + escape_name(result['describe']['name'])
            for result in results
        ]
    except Exception:
        # Completion must never crash the shell, but the previous bare
        # `except:` also swallowed KeyboardInterrupt/SystemExit; catch
        # Exception only so those still propagate.
        return []
def main():
    """Rename SRR-style FASTQ files in a DNAnexus folder to their ENCODE
    accession-based names.

    For each file matching SRR???????_?.fastq.gz, looks up the matching
    ENCODE file object and its derived fastqs, then renames the DX file to
    <accession>.fastq.gz (recording the original name in the
    'srr_filename' property) unless --dry_run is set.
    """
    args = get_args()
    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    project = resolve_project(args.project)
    SRR_files = dxpy.find_data_objects(
        name="SRR???????_?.fastq.gz", name_mode='glob',
        classname='file', recurse=True, return_handler=True,
        folder=args.folder, project=args.project)
    for srr_dxfile in SRR_files:
        m = re.search('(SRR.{7})_(\d)', srr_dxfile.name)
        if m:
            srr_basename = m.group(1)
            end_num = m.group(2)
        else:
            # Glob guarantees a match; fail loudly if that assumption breaks.
            assert m
        # Find the ENCODE file object with this SRR external accession.
        srr_encfiles = common.encoded_get('/'.join([server,'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked' % (srr_basename)]), keypair)['@graph']
        if not srr_encfiles:
            logging.error('%s object not found at ENCODE. Skipping.' % (srr_basename))
            continue
        elif len(srr_encfiles) > 1:
            logging.error('%s multiple matching objects found at ENCODE. Skipping.' % (srr_basename))
            continue
        else:
            srr_encfile = srr_encfiles[0]
        # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair)
        # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair)
        # biorep_n = replicate.get('biological_replicate_number')
        all_fastqs = common.encoded_get('/'.join([
            server,
            'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced' % (srr_basename)
        ]), keypair)['@graph']
        if not all_fastqs:
            print("%s: no fastq(s) found. Skipping." % (srr_dxfile.name))
            continue
        if end_num == '1':
            fastqs = [f for f in all_fastqs if f.get('run_type') == 'single-ended' or f.get('paired_end') == end_num]
        elif end_num in ['2', '3']:
            # NOTE(review): paired_end is hard-coded to '2' here even for
            # end_num '3' — looks deliberate (mapping _3 reads to mate 2),
            # but confirm against the ENCODE metadata model.
            fastqs = [f for f in all_fastqs if f.get('run_type') == 'paired-ended' and f.get('paired_end') == '2']
        if not fastqs:
            print("%s: no fastq(s) found for paired_end %s. Skipping" % (srr_basename, end_num))
            continue
        elif len(fastqs) > 1:
            print("%s: ambiguous matches to %s. Skipping" % (srr_basename, [f.get('accession') for f in fastqs]))
            continue
        else:
            fastq = fastqs[0]
            newname = '%s.fastq.gz' % (fastq.get('accession'))
            if args.dry_run:
                print('dry_run: Could rename %s to %s' % (srr_dxfile.name, newname))
            else:
                # Preserve the original SRR name as a property, then rename.
                srr_dxfile.set_properties({'srr_filename': srr_dxfile.name})
                srr_dxfile.rename(newname)
                print('%s renamed to %s' % (srr_dxfile.name, newname))
def process(filename, bucket_url, project, folder, skipvalidate=False):
    """Fetch (or reuse) a FASTQ file and optionally run FastQC on it.

    If the file is not already present in project:folder it is copied from
    the S3 bucket and uploaded; otherwise the existing DX file is
    downloaded locally.  Unless skipvalidate is set, FastQC is run and its
    data/summary/zip outputs are uploaded.

    Returns a dict with keys "file", "report", "summary", "zip" (the last
    three are None when skipvalidate is True).
    """
    # Change the following to process whatever input this stage
    # receives. You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.
    logger.debug(filename)

    existing = list(
        dxpy.find_data_objects(classname='file', folder=folder,
                               project=project, name_mode='exact',
                               name=filename, return_handler=False))
    if not existing:
        # cp the file from the bucket
        subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                              stderr=subprocess.STDOUT)
        subprocess.check_call(shlex.split('ls -l %s' % (filename)))
        dx_file = dxpy.upload_local_file(filename, project=project, folder=folder)
    else:
        dxpy.download_dxfile(existing[0]['id'], filename)
        dx_file = dxpy.dxfile.DXFile(existing[0]['id'])

    # BUG FIX: str.rstrip() strips a *set of characters*, not a suffix, so
    # the old chained rstrip('.gz').rstrip('.fq').rstrip('.fastq') mangled
    # basenames (e.g. "sample.fastq" -> "sample.fast" -> "sample" ->
    # trailing 's' stripped too).  Remove known extensions explicitly.
    reads_basename = filename
    for ext in ('.gz', '.fq', '.fastq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]

    if skipvalidate:
        return {
            "file": dx_file,
            "report": None,
            "summary": None,
            "zip": None
        }

    subprocess.check_call(['mkdir', 'output'])
    logger.info("Run QC")
    fqc_command = "/usr/bin/FastQC/fastqc " + filename + " -o output"
    logger.debug(fqc_command)
    stdio = subprocess.check_output(shlex.split(fqc_command))
    logger.debug(stdio)
    logger.debug(subprocess.check_output(['ls', '-l', 'output']))
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])
    logger.info("Upload results")
    subprocess.check_call(['mv', "%s_fastqc/fastqc_data.txt" % reads_basename,
                           "%s_data.txt" % reads_basename])
    subprocess.check_call(['mv', "%s_fastqc/summary.txt" % reads_basename,
                           "%s_summary.txt" % reads_basename])
    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename,
                                           folder=folder, project=project)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename,
                                            folder=folder, project=project)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename,
                                        folder=folder, project=project)
    logger.debug(report_dxfile)
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
def _resolve_global_entity(project_or_job_id, folderpath, entity_name, describe=True, visibility="either"):
    """
    :param project_or_job_id: The project ID to which the entity belongs
                              (then the entity is an existing data object),
                              or the job ID to which the entity belongs
                              (then the entity is a job-based object
                              reference to an object that may not exist yet)
    :type project_or_job_id: string
    :param folderpath: Full path to the object (parsed from command line)
    :type folderpath: string
    :param entity_name: Name of the object
    :type entity_name: string
    :param describe: Input mapping used to describe the job's project if
                     project_or_job_id is a job ID, or True if the input
                     mapping is to be empty
    :type describe: dict or True
    :param visibility: The expected visibility of the entity ("either",
                       "hidden", or "visible"); to be used in resolution
    :type visibility: string
    :returns: The results obtained from attempting to resolve the entity;
              the expected format of the return value is described below
    :rtype: list
    :raises: ResolutionError if dxpy.find_data_objects throws an error

    If project_or_job_id is a job ID, then the return value will be like:
        [{"id": ..., "describe": {...}}, ...]
    Otherwise, the return value will be like:
        [{"id": ..., "project": ..., "describe": {...}}, ...]

    Note that if the entity is successfully resolved, then the "describe"
    key will be in the dictionary if and only if a nonempty describe
    mapping was provided.

    TODO: Inspect entity_name and conditionally treat it as a "glob" pattern.
    """
    if is_job_id(project_or_job_id):
        if describe is True:
            # Normalize the sentinel True into an empty describe mapping.
            describe = {}
        # The following function call will raise a ResolutionError if no
        # results could be found.
        # If the call is successful, then the project will be incorporated
        # into the "describe" mapping of the returned dictionaries.
        return resolve_job_ref(project_or_job_id, entity_name, describe=describe)
    else:
        try:
            return list(dxpy.find_data_objects(project=project_or_job_id,
                                               folder=folderpath,
                                               name=entity_name,
                                               name_mode='glob',
                                               recurse=False,
                                               describe=describe,
                                               visibility=visibility))
        except Exception as details:
            # Wrap any search failure in the module's resolution error type.
            raise ResolutionError(str(details))
def find_replicates(reps, source_id, project, experiment, test=False):
    """Locate replicate files by glob name in the source project and, unless
    testing, copy them into /<experiment> of the target project."""
    found = []
    for pattern in reps:
        found.extend(
            dxpy.find_data_objects(classname='file',
                                   name=pattern,
                                   name_mode='glob',
                                   project=source_id,
                                   return_handler=False))
    if test:
        return found
    return copy_files(found, project.get_id(), "/"+experiment)
def count_logfiles(self):
    """Count logfiles in the DNAnexus project (self.id).

    Logfiles are expected in 'Logfiles/', a subdirectory of the uploaded
    runfolder (whose DNAnexus name drops the first four characters of the
    local runfolder name).

    Returns:
        logfile_count (int): A count of logfiles
    """
    # Runfolder is renamed upon upload without the first four characters.
    uploaded_runfolder = dxpy.describe(self.id)['name'][4:]
    logfile_dir = str(Path('/', uploaded_runfolder, 'Logfiles'))
    logfile_count = sum(
        1 for _ in dxpy.find_data_objects(project=self.id,
                                          folder=logfile_dir,
                                          classname='file'))
    return logfile_count
def get_dxwdl_applet():
    """Build or find the applet to run dxWDL."""
    # Return the first closed applet matching name+version, if any exists.
    search = dxpy.find_data_objects(name=APPLET_NAME,
                                    properties={"version": APPLET_VERSION},
                                    classname="applet",
                                    state="closed",
                                    return_handler=True)
    for applet in search:
        return applet
    # Nothing found — build a fresh applet.
    return build_applet()
def find_replicates(reps, source_id, project, experiment, test=False):
    """Locate replicate files by glob name in the source project; unless
    testing, copy them into /<experiment> of the target project."""
    hits = [
        match
        for pattern in reps
        for match in dxpy.find_data_objects(classname='file',
                                            name=pattern,
                                            name_mode='glob',
                                            project=source_id,
                                            return_handler=False)
    ]
    if not test:
        hits = copy_files(hits, project.get_id(), "/" + experiment)
    return hits
def find_file(filePath,project=None,verbose=False,multiple=False, recurse=True): '''Using a DX style file path, find the file.''' proj = project path = filePath fileName = filePath if filePath.find(':') != -1: proj, path = filePath.split(':', 1) if path.rfind('/') != -1: path, fileName = path.rsplit('/', 1) else: fileName = path path = '/' if proj == None: if verbose: print "ERROR: Don't know what project to use for '" + path + "'." return None if proj.find('project-') == 0: projId = proj else: projId = get_project(proj, level='VIEW').get_id() mode = 'exact' if filePath.find('*') or filePath.find('?'): mode = 'glob' fileDicts = list(dxpy.find_data_objects(classname='file', folder=path, name=fileName, recurse=recurse, name_mode=mode, project=projId, return_handler=False)) if fileDicts == None or len(fileDicts) == 0: #print "- Found 0 files from '" + proj + ":" + filePath + "'." if verbose: print "ERROR: Failed to find '" + proj + ":" + filePath + "'." return None elif len(fileDicts) > 1 or multiple: #print "- Found "+str(len(fileDict))+" files from '" + proj + ":" + filePath + "'." if not multiple: if verbose: print "ERROR: Found "+str(len(fileDicts))+" files when expecting 1 '" + proj + ":" + filePath + "'." return None else: if verbose: print " Found "+str(len(fileDicts))+" files for '" + proj + ":" + filePath + "'." fids = [] for fileDict in fileDicts: FILES[fileDict['id']] = dxpy.dxlink(fileDict) fids.append( fileDict['id'] ) return fids else: #print "- FOUND '" + proj + ":" + filePath + "'." FILES[fileDicts[0]['id']] = dxpy.dxlink(fileDicts[0]) return fileDicts[0]['id']
def get_data_matches(text, delim_pos, dxproj, folderpath, classname=None, typespec=None, visibility=None):
    """
    :param text: String to be tab-completed; still in escaped form
    :type text: string
    :param delim_pos: index of last unescaped "/" or ":" in text
    :type delim_pos: int
    :param dxproj: DXProject handler to use
    :type dxproj: DXProject
    :param folderpath: Unescaped path in which to search for data object matches
    :type folderpath: string
    :param classname: Data object class by which to restrict the search
                      (None for no restriction on class)
    :type classname: string
    :param visibility: Visibility to constrain the results to; default is
                       "visible" for empty strings, "either" for nonempty
    :type visibility: string
    :returns: List of matches
    :rtype: list of strings

    Members of the returned list are guaranteed to start with *text* and be
    in escaped form for consumption by the command-line.
    """
    # unescaped_text = unescape_completion_name_str(text[delim_pos + 1:])
    unescaped_text = text[delim_pos + 1 :]

    if visibility is None:
        if text != "" and delim_pos != len(text) - 1:
            visibility = "either"
        else:
            visibility = "visible"

    try:
        results = list(
            dxpy.find_data_objects(
                project=dxproj.get_id(),
                folder=folderpath,
                name=unescaped_text + "*",
                name_mode="glob",
                recurse=False,
                visibility=visibility,
                classname=classname,
                limit=100,
                describe=True,
                typename=typespec,
            )
        )
        prefix = "" if text == "" else text[: delim_pos + 1]
        return [prefix + escape_name(result["describe"]["name"]) for result in results]
    except Exception:
        # Completion must never crash the shell, but the previous bare
        # `except:` also swallowed KeyboardInterrupt/SystemExit; catch
        # Exception only so those still propagate.
        return []
def get_localizer_applet():
    """Return a dxpy.DXApplet object for the localizer applet."""
    # First try to find an existing applet.
    search = dxpy.find_data_objects(name=APPLET_NAME,
                                    properties={"version": APPLET_VERSION},
                                    classname="applet",
                                    state="closed",
                                    return_handler=True)
    existing = next(iter(search), None)
    if existing is not None:
        return existing
    return build_applet()
def main():
    """Set up and launch the ENCODE Bismark DNA-methylation workflow for
    the replicate files named on the command line.

    Resolves the target project (and, outside test mode, a read-only
    snapshot source project), collects/copies the replicate files, then
    builds the workflow via populate_workflow().
    """
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')
    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)
    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        # Test mode reads replicates from the analysis project itself.
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()
    # Gather replicate file handles from the source project by glob name.
    replicates = []
    for rep in args.replicates:
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='glob', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)
    if not args.test:
        # Copy replicate files into the experiment folder of the project.
        replicates = copy_files(replicates, project.get_id(), "/"+args.experiment)
    if not replicates:
        print "No replicates found in project: " + project.name
        print "Looking for " + ", ".join(args.replicates)
        sys.exit(1)
    paired = args.paired
    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata
    # Now create a new workflow ()
    spec_name = args.experiment+'-'+'-'.join([ r.split('.')[0] for r in args.replicates])
    wf = dxpy.new_dxworkflow(title='dx_dna_me_'+spec_name,
                             name='ENCODE Bismark DNA-ME pipeline: '+spec_name,
                             description='The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment' + args.experiment,
                             folder='/'+args.experiment,
                             project=project.get_id())
    populate_workflow(wf, replicates, args.experiment, paired, gender, organism, project.id)
def get_ta_from_accessions(accessions, default_project, ta_folders):
    """Find the tagAlign whose filename mentions every given accession.

    Scans the bams/ subtree of each folder spec in *ta_folders* (each spec
    is either "project-name:path" or a bare path within *default_project*)
    for closed *tagAlign.gz files and returns the describe hash of the
    first file whose name contains all of *accessions*, or None.
    """
    candidates = []
    for spec in ta_folders:
        if ':' in spec:
            proj_name, folder = spec.split(':')
            proj_id = resolve_project(proj_name).get_id()
            proj_name += ":"
        else:
            proj_id = default_project
            proj_name = ""
            folder = spec
        # Normalize to an absolute folder path with a trailing slash.
        if not folder.startswith('/'):
            folder = '/' + folder
        if not folder.endswith('/'):
            folder += '/'
        logging.debug(
            "Looking for TA's in %s %s %s" % (proj_id, proj_name, folder))
        finder = dxpy.find_data_objects(
            classname='file',
            state='closed',
            folder=folder + 'bams/',
            project=proj_id,
            describe=True,
            recurse=True,
            name='*tagAlign.gz',
            name_mode='glob')
        candidates.extend(hit.get('describe') for hit in finder)

    matched_files = [desc for desc in candidates
                     if all(acc in desc['name'] for acc in accessions)]
    if not matched_files:
        logging.error(
            'Could not find tagAlign with accessions %s' % (accessions))
        return None
    if len(matched_files) > 1:
        logging.warning(
            'Found multiple tagAligns that matched accessions %s'
            % (accessions))
        logging.warning(
            'Matched files %s'
            % ([(f['folder'], f['name']) for f in matched_files]))
        logging.warning('Using first one found')
    return matched_files[0]
def addLevel(self, node, folder):
    """
    Recurse into folders, and find all IGV-compatible files to be added to registry
    :param node: an Element, or SubElement to add items to
    :param folder: a folder to find files within
    :return: nothing.
    """
    assert node is not None
    assert folder is not None
    print("Adding {}:{}".format(self.project.name, folder))
    # List only the immediate subfolders of `folder` (no data objects),
    # asking the API for just id/name/class fields.
    subfolders = dxpy.api.project_list_folder(self.project.id,
                                              input_params={"folder": folder,
                                                            "describe": {
                                                                "fields": {"id": True, "name": True, "class": True}},
                                                            "only": "folders",
                                                            "includeHidden": False},
                                              always_retry=True)["folders"]
    subfolders = [os.path.basename(subfolder) for subfolder in subfolders]
    # Skip folders that never hold IGV-loadable tracks.
    subfolders = list(set(subfolders) - set(("metrics", "inputFastq", "reports")))
    for subfolder in subfolders:
        # One XML <Category> per subfolder, filled in recursively.
        subnode = SubElement(node, "Category", name=subfolder)
        subnodepath = str(folder + "/" + subfolder).replace("//", "/")
        self.addLevel(subnode, subnodepath)
    # Now register the data files that live directly in this folder,
    # sorted by name for stable registry output.
    dxfiles = list(dxpy.find_data_objects(
        recurse=False,
        folder=folder,
        return_handler=True,
        project=self.project.get_id())
    )
    dxfiles.sort(key=lambda x: x.name)
    for dxfile in dxfiles:
        if isinstance(dxfile, dxpy.DXFile):
            # n, ext = os.path.splitext(dxfile.name)
            # Dispatch on filename suffix: BAM and VCF tracks need a
            # sibling index file; the rest are standalone tracks.
            if str(dxfile.name).endswith("bam"):
                self.__addIndexedFile(dxfile, folder, node, ["bai"])
            elif str(dxfile.name).endswith("vcf.gz"):
                self.__addIndexedFile(dxfile, folder, node, ["tbi", "idx"])
            elif str(dxfile.name).endswith("bw"):
                self.__addNonIndexedFile(dxfile, folder, node)
            elif str(dxfile.name).endswith("bed.gz"):
                self.__addNonIndexedFile(dxfile, folder, node)
            elif str(dxfile.name).endswith("seg"):
                self.__addNonIndexedFile(dxfile, folder, node)
            elif str(dxfile.name).endswith("cn"):
                self.__addNonIndexedFile(dxfile, folder, node)
def find_fastq_files(self):
    '''
    Description: Returns a list of dxids for all fastq files in the lane
    project.

    Fix: the loop previously iterated `self.fastq_files_generator`, an
    attribute that is never assigned anywhere in this method (the search
    result was bound to a *local* of the same name), so the method raised
    AttributeError. It now consumes the local generator.

    DEV: Instead of returning a generator, I think this should return
    dxids for each fastq file. Same for interop, and bam files.
    '''
    fastq_dxids = []
    fastq_files_generator = dxpy.find_data_objects(classname='file',
                                                   name='*.fastq.gz',
                                                   name_mode='glob',
                                                   project=self.project_id,
                                                   folder='/')
    for fastq_dict in fastq_files_generator:
        fastq_dxids.append(fastq_dict['id'])
    return fastq_dxids
def main():
    """Write a bz2-compressed JSON manifest of the files in a project folder."""
    parser = argparse.ArgumentParser(
        description=
        'Create a manifest file for a particular folder in a project')
    parser.add_argument('folder',
                        help='a folder in the current DNAnexus project')
    parser.add_argument('--outfile',
                        help='Name of the output file',
                        default='manifest.json.bz2')
    parser.add_argument(
        '-r',
        '--recursive',
        help='Recursively traverse folders and append to manifest',
        action='store_true')
    args = parser.parse_args()

    project, folder, _ = resolve_existing_path(args.folder)

    ids = dxpy.find_data_objects(classname='file',
                                 first_page_size=1000,
                                 describe={
                                     'id': True,
                                     'name': True,
                                     'folder': True,
                                     'parts': True
                                 },
                                 project=project,
                                 folder=folder,
                                 recurse=args.recursive)
    manifest = {project: []}
    for i, f in enumerate(ids):
        manifest[project].append(fileID2manifest(f['describe'], project))
        # Progress heartbeat for large folders.
        if i % 1000 == 0 and i != 0:
            print("Processed {} files".format(i))

    # bz2.compress() takes and returns bytes, so the JSON text must be
    # encoded and the output opened in binary mode; the previous text-mode
    # "w" write raised TypeError on Python 3.
    with open(args.outfile, "wb") as f:
        f.write(bz2.compress(
            json.dumps(manifest, indent=2, sort_keys=True).encode()))

    print("Manifest file written to {}".format(args.outfile))
    print("Total {} objects".format(len(manifest[project])))
def find_bam_files(self): ''' DEV: DEPRECATED add functionality to also find BAI files ''' bam_dxids = [] bam_files_generator = dxpy.find_data_objects(classname='file', name='*.bam', name_mode='glob', project=self.project_id, folder='/') bam_files = list(bam_files_generator) if len(bam_files) < 1: print 'Info: No bam files found.' pass else: for bam_dict in bam_files: bam_dxid = bam_dict['id'] bam_dxids.append(bam_dxid) return bam_dxids
def get_repns(exp_id, ta_folders): for base_folder in ta_folders: if ':' in base_folder: project_name, path = base_folder.split(':') project = resolve_project(project_name) project = project.get_id() project_name += ":" else: project = default_project project_name = "" path = base_folder if not path.startswith('/'): path = '/' + path print project, project_name, path for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project): desc = dxfile.get('describe') if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')): possible_files.append(desc) print "%s %i possible files" %(exp_id, len(possible_files)) folders = [f.get('folder') for f in possible_files] print "%s folders %s" %(exp_id, folders)
def get_all_tas(experiment, default_project, ta_folders): exp_id = experiment['accession'] possible_files = [] for base_folder in ta_folders: if ':' in base_folder: project_name, path = base_folder.split(':') project = resolve_project(project_name) project = project.get_id() project_name += ":" else: project = default_project project_name = "" path = base_folder if not path.startswith('/'): path = '/' + path print project, project_name, path for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project): desc = dxfile.get('describe') if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')): possible_files.append(desc) return possible_files
def get_tas(exp_id, default_project, ta_folders):
    # Resolve the rep1 and rep2 tagAlign files for experiment exp_id by
    # scanning each folder spec in ta_folders; returns a pair of
    # "project:folder/name" path strings (either may be None when exactly
    # one match for that rep was not found).
    possible_files = []
    for base_folder in ta_folders:
        # Folder specs may carry an explicit "project-name:" prefix;
        # otherwise default_project is searched.
        if ":" in base_folder:
            project_name, path = base_folder.split(":")
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            path = base_folder
        if not path.startswith("/"):
            path = "/" + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(
            classname="file",
            state="closed",
            folder=path,
            describe=True,
            recurse=True,
            project=project
        ):
            desc = dxfile.get("describe")
            # Keep only tagAligns under this experiment's .../bams folder.
            if (
                exp_id in desc.get("folder")
                and "/bams" in desc.get("folder")
                and desc.get("name").endswith(("tagAlign", "tagAlign.gz"))
            ):
                possible_files.append(desc)
    print "%s %i possible files" % (exp_id, len(possible_files))
    # Partition by replicate number inferred from the folder path.
    rep1_files = [f for f in possible_files if "rep1" in f.get("folder")]
    rep2_files = [f for f in possible_files if "rep2" in f.get("folder")]
    # Exactly one candidate per rep is required; otherwise that rep is None.
    if len(rep1_files) != 1:
        print "Tried to find one rep1 ta, found %d" % (len(rep1_files))
        rep1 = None
    else:
        rep1 = rep1_files[0].get("project") + ":" + rep1_files[0].get("folder") + "/" + rep1_files[0].get("name")
    if len(rep2_files) != 1:
        print "Tried to find one rep2 ta, found %d" % (len(rep2_files))
        rep2 = None
    else:
        rep2 = rep2_files[0].get("project") + ":" + rep2_files[0].get("folder") + "/" + rep2_files[0].get("name")
    return rep1, rep2
def get_data_matches(text, delim_pos, dxproj, folderpath, classname=None, typespec=None):
    '''
    :param text: String to be tab-completed; still in escaped form
    :type text: string
    :param delim_pos: index of last unescaped "/" or ":" in text
    :type delim_pos: int
    :param dxproj: DXProject handler to use
    :type dxproj: DXProject
    :param folderpath: Unescaped path in which to search for data object matches
    :type folderpath: string
    :param classname: Data object class by which to restrict the search (None for no restriction on class)
    :type classname: string
    :returns: List of matches
    :rtype: list of strings

    Members of the returned list are guaranteed to start with *text* and be
    in escaped form for consumption by the command-line.

    NOTE(review): legacy variant of this helper; relies on Python 2
    ``map``/``filter`` returning lists — under Python 3 the return value
    would be a ``filter`` iterator, so the :rtype: above only holds on
    Python 2. Confirm the target runtime.
    '''
    unescaped_text = unescape_completion_name_str(text[delim_pos + 1:])
    try:
        results = list(dxpy.find_data_objects(project=dxproj.get_id(),
                                              folder=folderpath,
                                              name=unescaped_text + "*",
                                              name_mode="glob",
                                              recurse=False,
                                              visibility='either' if text != '' and delim_pos != len(text) - 1 else 'visible',
                                              classname=classname,
                                              limit=100,
                                              describe=True,
                                              typename=typespec))
        # Re-escape each match and keep only those extending the typed text.
        names = map(lambda result: result['describe']['name'], results)
        return filter(startswith(text),
                      map(lambda name: ('' if text == '' else text[:delim_pos + 1]) + escape_completion_name_str(name),
                          names))
    except:
        # Tab completion must never crash the shell: any failure means
        # "no matches".
        return []
def find_fastqs(self):
    """Return the sorted names of fully-uploaded ("closed") fastq files in
    the DNAnexus project ``self.id``.

    The search uses name_mode='regexp', so any filename containing
    'fastq.gz' anywhere matches.
    """
    hits = dxpy.find_data_objects(project=self.id,
                                  classname='file',
                                  name='fastq.gz',
                                  name_mode='regexp')
    found_ids = [hit['id'] for hit in hits]

    closed_names = []
    for file_id in found_ids:
        desc = dxpy.describe(file_id)
        # Only 'closed' files have finished uploading.
        if desc['state'] == 'closed':
            closed_names.append(desc['name'])

    # Sort for cleaner, deterministic logfile output.
    fastq_filenames = sorted(closed_names)
    self.logger.debug(
        f'{self.id} contains {len(fastq_filenames)} "closed" fastq files: {fastq_filenames}'
    )
    return fastq_filenames
def get_vg_bundle(project, applets_folder, existing_dxid=None):
    # Return a DXFile for the vg executable bundle matching the current git
    # revision of the vg submodule: reuse an explicit file id if given,
    # otherwise reuse a previously built bundle, otherwise launch the
    # builder applet and wait for it.
    # NOTE(review): applets_folder is unused in this function — confirm.
    if existing_dxid is not None:
        return dxpy.DXFile(existing_dxid)
    # determine desired git revision of vg
    vg_git_revision = subprocess.check_output(
        ["git", "describe", "--long", "--always", "--tags"],
        cwd=os.path.join(here,"vg")).strip()
    # is the exe available already? (bundles are tagged with their revision
    # via the git_revision property)
    existing = dxpy.find_data_objects(classname="file", typename="vg_bundle",
                                      project=project.get_id(), folder="/vg-bundle",
                                      properties={"git_revision": vg_git_revision},
                                      return_handler=True)
    existing = list(existing)
    if len(existing) > 0:
        if len(existing) > 1:
            print("Warning: found multiple vg bundles with git_revision={}, picking one".format(vg_git_revision))
        existing = existing[0]
        print("Using vg bundle {} ({})".format(vg_git_revision,existing.get_id()))
        return existing
    # no - build one for this git revision
    project.new_folder("/vg-bundle", parents=True)
    print("Building new vg bundle for {}".format(vg_git_revision))
    # Build and deploy the builder applet with the dx CLI, then run it.
    build_cmd = ["dx","build","-f","--destination",project.get_id()+":/vg-bundle/",os.path.join(here,"vg_bundle_builder")]
    print(" ".join(build_cmd))
    build_applet = dxpy.DXApplet(json.loads(subprocess.check_output(build_cmd))["id"])
    build_job = build_applet.run({"git_commit": vg_git_revision},
                                 project=project.get_id(), folder="/vg-bundle",
                                 name="vg_bundle_builder " + vg_git_revision)
    print("Launched {} to build vg bundle, waiting...".format(build_job.get_id()))
    # Emit a timestamp every minute so CI log output stays alive while we
    # block on the remote build job; always kill it afterwards.
    noise = subprocess.Popen(["/bin/bash", "-c", "while true; do sleep 60; date; done"])
    try:
        build_job.wait_on_done()
    finally:
        noise.kill()
    vg_bundle = dxpy.DXFile(build_job.describe()["output"]["vg_bundle"])
    print("Using vg bundle {} ({})".format(vg_git_revision,vg_bundle.get_id()))
    return vg_bundle
def _clone_to_all_regions(region2projid, regions, asset_file_name, folder, url): jobs = [] for region in regions: dest_proj_id = region2projid[region] results = list( dxpy.find_data_objects(classname="file", visibility="hidden", name=asset_file_name, project=dest_proj_id, folder=folder)) file_ids = [p["id"] for p in results] nfiles = len(file_ids) if nfiles == 1: continue if nfiles > 1: print("cleanup in {}, found {} files instead of 0/1".format( dest_proj_id, nfiles)) dxpy.DXProject(dest_proj_id).remove_objects(file_ids) dxjob = _clone_asset_into_region(region, dest_proj_id, asset_file_name, folder, url) jobs.append(dxjob) return jobs
def main():
    # Drive the ENCODE long-RNA-seq pipeline for one experiment: gather the
    # replicate files, stage them into the analysis project, then create and
    # populate a DNAnexus workflow (optionally as a public template).
    # NOTE(review): Python 2 code (print statements).
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')
    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    # Make sure a per-experiment folder exists in the analysis project.
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)
    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    # In test mode look in the analysis project itself, otherwise in the
    # read-only snapshot project.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()
    replicates = []
    for rep in args.replicates:
        # Exact-name match (unlike the glob match used by other pipelines).
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='exact', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)
    if not args.test:
        replicates = copy_files(replicates, project.get_id(), "/"+args.experiment)
    if not replicates:
        print "No replicates found in project: " + project.name
        print "Looking for " + ", ".join(args.replicates)
        sys.exit(1)
    # Collect the workflow inputs from command-line arguments.
    inputs = { 'rnd_seed': 12345 }
    inputs['paired'] = args.paired
    inputs['gender']= args.gender
    inputs['organism'] = args.organism
    inputs['library_id'] = args.library
    inputs['nthreads'] = args.nthreads
    #TODO determine paired or gender from ENCSR metadata
    # Now create a new workflow ()
    inputs['spec_name'] = args.experiment+'-'+'-'.join([ r.split('.')[0] for r in args.replicates])
    title_root = 'dx_long_rna_seq_'
    name_root = 'ENCODE Long RNA Seq: '
    desc = 'The ENCODE RNA Seq pipeline for long RNAs'
    if args.paired:
        title_root = title_root + '_paired_end '
        name_root = name_root + '(paired-end) '
        inputs['stranded'] = True
    else:
        title_root = title_root + '_single_end '
        name_root = name_root + '(single-end) '
        inputs['stranded'] = False
    if args.export:
        # Export mode publishes a generic workflow template into the public
        # project rather than an experiment-specific one.
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT, name_mode='exact', return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=title_root,
                                 name=name_root,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        wf = dxpy.new_dxworkflow(title=title_root+inputs['spec_name'],
                                 name=name_root+inputs['spec_name'],
                                 description=desc+' for experiment:' + args.experiment,
                                 folder='/'+args.experiment,
                                 project=project.get_id())
    populate_workflow(wf, replicates, args.experiment, inputs, project.id, args.export)
def get_tas(experiment, server, keypair, default_project, ta_folders):
    # Resolve the rep1/rep2 tagAlign files and their control tagAligns for a
    # ChIP experiment. Returns a dict keyed 'rep1_ta'/'rep2_ta', each value
    # holding roughly:
    #   {'file_id': ..., 'project_id': ..., 'folder': ..., 'file_name': ...,
    #    'enc_fqs': [...], 'controlled_by': [...], 'controlled_by_name': ...,
    #    'control_id': ..., 'enc_repn': n, 'paired_end': bool}
    # or None when resolution fails.
    #
    # Algorithm (from the original author's notes):
    # - for each ta_folder get the list of TA's in /ta_folder/bams/ENCSR...
    # - from this list infer repns from the paths ../bams/ENCSR.../repn*
    # - from this list infer the ENCFF's for the fastqs that were used
    # - for each repn go to the experiment and find all the fastqs for that rep
    #   (if there are different fastqs in the experiment, or different reps, warn)
    # - for each fastq found in the TA filename, find its controlled_by;
    #   if any have controlled_by, all must have controlled_by else error,
    #   then gather the controlled_by list and find a TA (anywhere in
    #   ta_folders) with those ENCFF's, else error
    # - else get possible_controls and try to match the repn, else pick one
    #   (remember it), gather the fastqs in the possible_controls and find
    #   (one) TA with those ENCFF's, else error
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        # Folder specs may carry an explicit "project-name:" prefix.
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        # Normalize to an absolute folder path with a trailing slash.
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug("Looking for TA's in %s %s %s" % (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
            classname='file',
            state='closed',
            folder=path + 'bams/%s/' % (exp_id),
            project=project_id,
            describe=True,
            recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' % (len(possible_files)))
    logging.debug('%s' % ([(f.get('folder'), f.get('name')) for f in possible_files]))
    # Infer replicate numbers from the trailing /repN folder component,
    # keeping only the first file found per rep.
    repns = []
    files_to_ignore = []
    for f in possible_files:
        m = re.search('/rep(\d+)$', f['folder'])
        if m:
            repn = int(m.group(1))
            logging.debug("Matched rep%d" % (repn))
            if repn in repns:
                logging.warning(
                    "Ignoring additional rep%d bam, using first found"
                    % (repn))
                files_to_ignore.append(f)
            else:
                logging.debug("First time finding rep%d" % (repn))
                repns.append(repn)
        else:
            logging.error("Cannot parse rep number from %s" % (f['folder']))
            return None
    for f in files_to_ignore:
        possible_files.remove(f)
    logging.debug('Discovered repns %s' % (repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s. Found %d: %s"
                      % (exp_id, len(repns), repns))
        return None
    tas = {}
    used_controls = []
    for i, repn in enumerate(repns):
        # Fetch the ENCODE file records for the fastqs encoded in this TA's
        # filename, then flatten their controlled_by lists.
        encode_files = [
            common.encoded_get(server + '/files/%s/' % (f), keypair)
            for f in get_encffs(possible_files[i].get('name'))
        ]
        controlled_by = common.flat(
            [f.get('controlled_by') for f in encode_files])
        if any(controlled_by):
            # Explicit controlled_by: resolve a control TA by accession.
            controlled_by_accessions = list(
                set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(controlled_by_accessions,
                                                      default_project,
                                                      ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error(
                    "%s: Could not find controlled_by_ta for accessions %s"
                    % (experiment.get('accession'), controlled_by_accessions))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
        else:
            #evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning(
                '%s: No controlled_by for rep%d, attempting to infer from possible_controls %s'
                % (experiment.get('accession'), repn, possible_controls))
            if not possible_controls or not any(possible_controls):
                logging.error(
                    '%s: Could not find controlled_by or resolve possible_controls for rep%d'
                    % (experiment.get('accession'), repn))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server,
                                                 keypair, default_project,
                                                 ta_folders, used_controls)
                controlled_by_ta_name = control_ta.get('name')
                controlled_by_ta_id = control_ta.get('id')
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps'
                            % (controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        #if encode repns are 1,2 then let the pipline input rep numbers (1 or 2) be the same.
        #Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        if repn == min(repns):
            ta_index = 1
        else:
            ta_index = 2
        tas.update({
            'rep%d_ta' % (ta_index): {
                'file_id': possible_files[i].get('id'),
                'project_id': possible_files[i].get('project'),
                'folder': possible_files[i].get('folder'),
                'file_name': possible_files[i].get('name'),
                'enc_fqs': get_encffs(possible_files[i].get('name')),
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(possible_files[i])
            }
        })
    return tas
def resolve_existing_path(path, expected=None, ask_to_resolve=True, expected_classes=None,
                          allow_mult=False, describe={}, all_mult=False, allow_empty_string=True):
    '''
    :param ask_to_resolve: Whether picking may be necessary (if true, a list is returned; if false, only one result is returned)
    :type ask_to_resolve: boolean
    :param allow_mult: Whether to allow the user to select multiple results from the same path
    :type allow_mult: boolean
    :param describe: Input hash to describe call for the results
    :type describe: dict
    :param all_mult: Whether to return all matching results without prompting (only applicable if allow_mult == True)
    :type all_mult: boolean
    :returns: A LIST of results when ask_to_resolve is False or allow_mult is True
    :raises: :exc:`ResolutionError` if the request path was invalid, or a single result was requested and input is not a TTY
    :param allow_empty_string: If false, a ResolutionError will be raised if *path* is an empty string. Use this when resolving the empty string could result in unexpected behavior.
    :type allow_empty_string: boolean

    Returns either a list of results or a single result (depending on
    how many is expected; if only one, then an interactive picking of
    a choice will be initiated if input is a tty, or else throw an error).

    TODO: Always treats the path as a glob pattern.

    Output is of the form {"id": id, "describe": describe hash} a list
    of those

    TODO: Allow arbitrary flags for the describe hash.

    NOTE: if expected_classes is provided and conflicts with the class
    of the hash ID, it will return None for all fields.

    NOTE(review): the `describe` parameter uses a mutable default dict and
    is mutated below (a 'project' key may be inserted/removed), so state can
    leak across calls that rely on the default — confirm intended.
    '''
    project, folderpath, entity_name = resolve_path(
        path, expected, allow_empty_string=allow_empty_string)
    if entity_name is None:
        # Definitely a folder (or project)
        # FIXME? Should I check that the folder exists if expected="folder"?
        return project, folderpath, entity_name
    elif is_hashid(entity_name):
        # Path names a dxid directly; optionally vet it against the
        # expected classes by id prefix.
        found_valid_class = True
        if expected_classes is not None:
            found_valid_class = False
            for klass in expected_classes:
                if entity_name.startswith(klass):
                    found_valid_class = True
        if not found_valid_class:
            return None, None, None
        # Give the API a project hint for the describe, preferring the
        # resolved project, then the current workspace.
        if 'project' not in describe:
            if project != dxpy.WORKSPACE_ID:
                describe['project'] = project
            elif dxpy.WORKSPACE_ID is not None:
                describe['project'] = dxpy.WORKSPACE_ID
        try:
            desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe', describe)
        except Exception as details:
            if 'project' in describe:
                # Now try it without the hint
                del describe['project']
                try:
                    desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe', describe)
                except Exception as details:
                    raise ResolutionError(str(details))
            else:
                raise ResolutionError(str(details))
        result = {"id": entity_name, "describe": desc}
        if ask_to_resolve and not allow_mult:
            return project, folderpath, result
        else:
            return project, folderpath, [result]
    elif project is None:
        raise ResolutionError(
            'Could not resolve \"' + path + '\" to a project context. Please either set a default project using dx select or cd, or add a colon (":") after your project ID or name'
        )
    else:
        msg = 'Object of name ' + unicode(
            entity_name) + ' could not be resolved in folder ' + unicode(
                folderpath) + ' of project ID ' + str(project)
        # Probably an object
        if is_job_id(project):
            # The following will raise if no results could be found
            results = resolve_job_ref(project, entity_name, describe=describe)
        else:
            try:
                results = list(
                    dxpy.find_data_objects(project=project,
                                           folder=folderpath,
                                           name=entity_name,
                                           name_mode='glob',
                                           recurse=False,
                                           describe=describe,
                                           visibility='either'))
            except BaseException as details:
                raise ResolutionError(str(details))
        if len(results) == 0:
            # Could not find it as a data object. If anything, it's a
            # folder.
            if '/' in entity_name:
                # Then there's no way it's supposed to be a folder
                raise ResolutionError(msg)
            # This is the only possibility left. Leave the
            # error-checking for later. Note that folderpath does
            possible_folder = folderpath + '/' + entity_name
            possible_folder, skip = clean_folder_path(possible_folder, 'folder')
            return project, possible_folder, None
        # Caller wants ALL results; just return the whole thing
        if not ask_to_resolve:
            return project, None, results
        if len(results) > 1:
            # Multiple matches: return them all if allowed, otherwise let
            # the user pick interactively (TTY only).
            if allow_mult and (all_mult or is_glob_pattern(entity_name)):
                return project, None, results
            if sys.stdout.isatty():
                print 'The given path \"' + path + '\" resolves to the following data objects:'
                choice = pick(map(
                    lambda result: get_ls_l_desc(result['describe']),
                    results),
                    allow_mult=allow_mult)
                if allow_mult and choice == '*':
                    return project, None, results
                else:
                    return project, None, ([results[choice]]
                                           if allow_mult else results[choice])
            else:
                raise ResolutionError('The given path \"' + path +
                                      '\" resolves to ' + str(len(results)) +
                                      ' data objects')
        elif len(results) == 1:
            return project, None, ([results[0]] if allow_mult else results[0])
def interactive_help(in_class, param_desc, prompt):
    # Interactively obtain a value for one input parameter of class
    # `in_class`, described by `param_desc`. For data-object classes the
    # user browses projects/jobs to pick an object; for scalar classes the
    # user types a value. Returns a list of string token(s) for the caller
    # to parse as the input value.
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'
    if in_class in dx_data_classes:
        # Class is some sort of data object
        if dxpy.WORKSPACE_ID is not None:
            proj_name = None
            try:
                proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
            except:
                # Best effort only; the working-directory banner is optional.
                pass
            if proj_name is not None:
                print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))
        while True:
            print('Pick an option to find input data:')
            try:
                opt_num = pick(['List and choose from available data in the current project',
                                'List and choose from available data in the DNAnexus Reference Genomes Files project',
                                'Select another project to list and choose available data',
                                'Select an output from a previously-run job (current project only)',
                                'Return to original prompt (specify an ID or path directly)'])
            except KeyboardInterrupt:
                # ^C falls through to the "specify directly" option.
                opt_num = 4
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                # Pick the public reference-genomes project matching the
                # workspace's region (if any).
                region = None
                if dxpy.WORKSPACE_ID:
                    region = dxpy.describe(dxpy.WORKSPACE_ID).get("region")
                query_project = dxpy.find_one_project(name="Reference Genome Files:*", public=True, billed_to="org-dnanexus_apps", level="VIEW", name_mode="glob", region=region)['id']
            elif opt_num == 2:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                query_project = paginate_and_pick(project_generator, (lambda result: result['describe']['name']))['id']
            if opt_num in range(3):
                # Options 0-2 all end with browsing data in query_project.
                result_generator = dxpy.find_data_objects(classname=in_class,
                                                          typename=param_desc.get('type'),
                                                          describe=dict(fields=get_ls_l_desc_fields()),
                                                          project=query_project)
                print('\nAvailable data:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_ls_l_desc(result['describe'])))
                if result_choice == 'none found':
                    print('No compatible data found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    return [result_choice['project'] + ':' + result_choice['id']]
            elif opt_num == 3:
                # Select from previous jobs in current project
                result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                                  describe=True,
                                                  parent_job="none")
                print()
                print('Previously-run jobs to choose from:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_find_executions_string(result['describe'],
                                                                                             has_children=False,
                                                                                             single_result=True)),
                                                  filter_fn=(lambda result: result['describe']['state'] not in ['unresponsive', 'terminating', 'terminated', 'failed']))
                if result_choice == 'none found':
                    print('No jobs found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    # Determine the job's output field names, falling back
                    # to the executable's outputSpec when the job's own
                    # output hash is unavailable.
                    if 'output' in result_choice['describe'] and result_choice['describe']['output'] != None:
                        keys = result_choice['describe']['output'].keys()
                    else:
                        exec_handler = dxpy.get_handler(result_choice.get('app', result_choice['applet']))
                        exec_desc = exec_handler.describe()
                        if 'outputSpec' not in exec_desc:
                            # This if block will either continue, return, or raise
                            print('No output spec found for the executable')
                            try:
                                field = input('Output field to use (^C or <ENTER> to cancel): ')
                                if field == '':
                                    continue
                                else:
                                    return [result_choice['id'] + ':' + field]
                            except KeyboardInterrupt:
                                continue
                        else:
                            keys = exec_desc['outputSpec'].keys()
                    if len(keys) > 1:
                        print('\nOutput fields to choose from:')
                        field_choice = pick(keys)
                        return [result_choice['id'] + ':' + keys[field_choice]]
                    elif len(keys) == 1:
                        print('Using the only output field: ' + keys[0])
                        return [result_choice['id'] + ':' + keys[0]]
                    else:
                        print('No available output fields')
            else:
                # "Return to original prompt": let the user type a path/ID.
                print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
                return shlex.split(input(prompt))
    else:
        # Scalar classes: show a type-appropriate hint, then read a value.
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
            print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            # A string is taken verbatim, not shell-tokenized.
            return [result]
        else:
            return shlex.split(result)
def interactive_help(in_class, param_desc, prompt):
    # Interactively obtain a value for one input parameter of class
    # `in_class`, described by `param_desc`. Data-object classes are chosen
    # by browsing projects/jobs; scalar classes are typed in. Returns a list
    # of string token(s) for the caller to parse as the input value.
    # NOTE(review): older variant of this helper — it resolves the public
    # reference project by exact name without region awareness and uses
    # describe=True (full describe) when listing data.
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'
    if in_class in dx_data_classes:
        # Class is some sort of data object
        if dxpy.WORKSPACE_ID is not None:
            proj_name = None
            try:
                proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
            except:
                # Best effort only; the working-directory banner is optional.
                pass
            if proj_name is not None:
                print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))
        while True:
            print('Pick an option to find input data:')
            try:
                opt_num = pick(['List and choose from available data in the current project',
                                'List and choose from available data in the DNAnexus Reference Genomes project',
                                'Select another project to list and choose available data',
                                'Select an output from a previously-run job (current project only)',
                                'Return to original prompt (specify an ID or path directly)'])
            except KeyboardInterrupt:
                # ^C falls through to the "specify directly" option.
                opt_num = 4
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                query_project = dxpy.find_one_project(name="Reference Genome Files", public=True, billed_to="org-dnanexus", level="VIEW")['id']
            elif opt_num == 2:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                query_project = paginate_and_pick(project_generator, (lambda result: result['describe']['name']))['id']
            if opt_num in range(3):
                # Options 0-2 all end with browsing data in query_project.
                result_generator = dxpy.find_data_objects(classname=in_class,
                                                          typename=param_desc.get('type'),
                                                          describe=True,
                                                          project=query_project)
                print('\nAvailable data:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_ls_l_desc(result['describe'])))
                if result_choice == 'none found':
                    print('No compatible data found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    return [result_choice['project'] + ':' + result_choice['id']]
            elif opt_num == 3:
                # Select from previous jobs in current project
                result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                                  describe=True,
                                                  parent_job="none")
                print()
                print('Previously-run jobs to choose from:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_find_executions_string(result['describe'],
                                                                                             has_children=False,
                                                                                             single_result=True)),
                                                  filter_fn=(lambda result: result['describe']['state'] not in ['unresponsive', 'terminating', 'terminated', 'failed']))
                if result_choice == 'none found':
                    print('No jobs found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    # Determine the job's output field names, falling back
                    # to the executable's outputSpec when the job's own
                    # output hash is unavailable.
                    if 'output' in result_choice['describe'] and result_choice['describe']['output'] != None:
                        keys = result_choice['describe']['output'].keys()
                    else:
                        exec_handler = dxpy.get_handler(result_choice.get('app', result_choice['applet']))
                        exec_desc = exec_handler.describe()
                        if 'outputSpec' not in exec_desc:
                            # This if block will either continue, return, or raise
                            print('No output spec found for the executable')
                            try:
                                field = input('Output field to use (^C or <ENTER> to cancel): ')
                                if field == '':
                                    continue
                                else:
                                    return [result_choice['id'] + ':' + field]
                            except KeyboardInterrupt:
                                continue
                        else:
                            keys = exec_desc['outputSpec'].keys()
                    if len(keys) > 1:
                        print('\nOutput fields to choose from:')
                        field_choice = pick(keys)
                        return [result_choice['id'] + ':' + keys[field_choice]]
                    elif len(keys) == 1:
                        print('Using the only output field: ' + keys[0])
                        return [result_choice['id'] + ':' + keys[0]]
                    else:
                        print('No available output fields')
            else:
                # "Return to original prompt": let the user type a path/ID.
                print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
                return shlex.split(input(prompt))
    else:
        # Scalar classes: show a type-appropriate hint, then read a value.
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
            print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            # A string is taken verbatim, not shell-tokenized.
            return [result]
        else:
            return shlex.split(result)
def upload_applet(src_dir, uploaded_resources, check_name_collisions=True, overwrite=False, archive=False, project=None, override_folder=None, override_name=None, dx_toolkit_autodep="stable", dry_run=False, **kwargs):
    """
    Creates a new applet object.

    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to inject if none is present. "stable" for the APT package; "git" for HEAD of dx-toolkit master branch; or False for no dependency.
    :type dx_toolkit_autodep: boolean or string

    :returns: tuple of (applet ID, applet spec), or (None, None) on dry run
    :raises AppBuilderException: on a name collision without --overwrite/--archive,
        an unnamed applet, a malformed runSpec.execDepends, or a bad autodep value.
    """
    applet_spec = _get_applet_spec(src_dir)

    # An explicit project argument wins over the one in dxapp.json.
    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except Exception:
            # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
            # are not swallowed.
            raise AppBuilderException("Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)" % (src_dir,))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    # Handle a pre-existing applet with the same name at the destination:
    # delete it (overwrite), move it aside (archive), or fail.
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + ('/' if not applet_spec['folder'].endswith('/') else '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet", name=applet_spec["name"], folder=applet_spec['folder'], project=dest_project, recurse=False):
            if overwrite:
                logger.info("Deleting applet %s" % (result['id']))
                # TODO: test me
                dxpy.DXProject(dest_project).remove_objects([result['id']])
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    # Folder does not exist yet; create it.
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'], project=dest_project)
                # Timestamp the archived copy so successive archives are distinguishable.
                now = datetime.datetime.fromtimestamp(archived_applet.created / 1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info("Archived applet %s to %s:\"%s/%s\"" % (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException("An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given" % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Inline Readme.md and Readme.developer.md
    _inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "runSpec" in applet_spec and "file" in applet_spec["runSpec"]:
        # Avoid using runSpec.file for now, it's not fully implemented
        #code_filename = os.path.join(src_dir, applet_spec["runSpec"]["file"])
        #f = dxpy.upload_local_file(code_filename, wait_on_close=True)
        #applet_spec["runSpec"]["file"] = f.get_id()
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir, applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"].setdefault("bundledDepends", [])
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {"name": "dx-toolkit",
                          "package_manager": "git",
                          "url": "git://github.com/dnanexus/dx-toolkit.git",
                          "tag": "master",
                          "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"}
    # TODO: reject "beta" and "unstable" eventually
    elif dx_toolkit_autodep in ("stable", "beta", "unstable"):
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException("dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead" % (dx_toolkit_autodep,))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        # isinstance instead of exact type() checks: accepts compatible subclasses.
        if not isinstance(exec_depends, list) or any(not isinstance(dep, dict) for dep in exec_depends):
            raise AppBuilderException("Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(dep.get('name') in DX_TOOLKIT_PKGS or dep.get('url') in DX_TOOLKIT_GIT_URLS for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if "categories" in applet_spec:
        dxpy.DXApplet(applet_id, project=dest_project).add_tags(applet_spec["categories"])

    if archived_applet:
        # BUG FIX: record the id of the applet that REPLACED the archived one.
        # Previously this stored archived_applet.get_id() -- the archived
        # applet's own id -- making 'replacedWith' point at itself.
        archived_applet.set_properties({'replacedWith': applet_id})

    return applet_id, applet_spec
def resolve_existing_path(path, expected=None, ask_to_resolve=True, expected_classes=None, allow_mult=False, describe={}, all_mult=False):
    '''
    :param ask_to_resolve: Whether picking may be necessary (if true, a list is returned; if false, only one result is returned)
    :type ask_to_resolve: boolean
    :param allow_mult: Whether to allow the user to select multiple results from the same path
    :type allow_mult: boolean
    :param describe: Input hash to describe call for the results
    :type describe: dict
    :param all_mult: Whether to return all matching results without prompting (only applicable if allow_mult == True)
    :type all_mult: boolean
    :returns: A LIST of results when ask_to_resolve is False or allow_mult is True

    Returns either a list of results or a single result (depending on
    how many is expected; if only one, then an interactive picking of
    a choice will be initiated if input is a tty, or else throw an error).

    TODO: Always treats the path as a glob pattern.

    Output is of the form {"id": id, "describe": describe hash} a list
    of those

    TODO: Allow arbitrary flags for the describe hash.

    NOTE: if expected_classes is provided and conflicts with the class
    of the hash ID, it will return None for all fields.

    NOTE(review): the mutable default ``describe={}`` is mutated below
    (a 'project' key may be added/removed), so state can leak between
    calls that rely on the default -- confirm callers always pass their
    own dict or accept this.
    '''
    # Split the path into its project, folder, and entity-name components.
    project, folderpath, entity_name = resolve_path(path, expected)

    if entity_name is None:
        # Definitely a folder (or project)
        # FIXME? Should I check that the folder exists if expected="folder"?
        return project, folderpath, entity_name
    elif is_hashid(entity_name):
        # The path is a raw object ID (e.g. "file-xxxx"); optionally check it
        # against the allowed classes before describing it.
        found_valid_class = True
        if expected_classes is not None:
            found_valid_class = False
            for klass in expected_classes:
                if entity_name.startswith(klass):
                    found_valid_class = True
        if not found_valid_class:
            return None, None, None
        try:
            # Add a project hint to the describe call when one is available.
            if 'project' not in describe:
                if project != dxpy.WORKSPACE_ID:
                    describe['project'] = project
                elif dxpy.WORKSPACE_ID is not None:
                    describe['project'] = dxpy.WORKSPACE_ID
            desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe', describe)
        except:
            if 'project' in describe:
                # Now try it without the hint
                del describe['project']
                try:
                    desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe', describe)
                except BaseException as details:
                    raise ResolutionError(str(details))
            # NOTE(review): if the first describe fails and no 'project' hint
            # was set, ``desc`` is never assigned and the line below raises
            # NameError -- confirm whether an explicit re-raise is intended.
        result = {"id": entity_name, "describe": desc}
        if ask_to_resolve and not allow_mult:
            return project, folderpath, result
        else:
            return project, folderpath, [result]
    elif project is None:
        raise ResolutionError('Error: Could not resolve \"' + path + '\" to a project context.  Please either set a default project using dx select or cd, or add a colon (":") after your project ID or name')
    else:
        # Error message used on several failure paths below.
        msg = 'Object of name ' + unicode(entity_name) + ' could not be resolved in folder ' + unicode(folderpath) + ' of project ID ' + str(project)
        # Probably an object
        if is_job_id(project):
            # The following will raise if no results could be found
            results = resolve_job_ref(project, entity_name, describe=describe)
        else:
            results = list(dxpy.find_data_objects(project=project, folder=folderpath, name=entity_name, name_mode='glob', recurse=False, describe=describe, visibility='either'))
        if len(results) == 0:
            # Could not find it as a data object.  If anything, it's a
            # folder.
            if '/' in entity_name:
                # Then there's no way it's supposed to be a folder
                raise ResolutionError(msg)
            # This is the only possibility left.  Leave the
            # error-checking for later.  Note that folderpath does not
            # carry a trailing '/', so the entity name is appended directly.
            possible_folder = folderpath + '/' + entity_name
            possible_folder, skip = clean_folder_path(possible_folder, 'folder')
            return project, possible_folder, None

        # Caller wants ALL results; just return the whole thing
        if not ask_to_resolve:
            return project, None, results

        if len(results) > 1:
            # Multiple matches: return them all when allowed, otherwise ask
            # interactively (tty only) or fail.
            if allow_mult and (all_mult or is_glob_pattern(entity_name)):
                return project, None, results
            if sys.stdout.isatty():
                # Python 2 print statement (this module is Python 2).
                print 'The given path \"' + path + '\" resolves to the following data objects:'
                choice = pick(map(lambda result: get_ls_l_desc(result['describe']), results), allow_mult=allow_mult)
                if allow_mult and choice == '*':
                    return project, None, results
                else:
                    return project, None, ([results[choice]] if allow_mult else results[choice])
            else:
                raise ResolutionError('Error: The given path \"' + path + '\" resolves to ' + str(len(results)) + ' data objects')
        elif len(results) == 1:
            return project, None, ([results[0]] if allow_mult else results[0])
def process(filename, bucket_url, project, folder, skipvalidate=False):
    """
    Ensure ``filename`` exists as a platform file (uploading it from the S3
    ``bucket_url`` if absent from ``project``/``folder``, otherwise
    downloading the existing copy), then optionally run FastQC on it and
    upload the QC outputs.

    :param filename: local/remote file name of the FASTQ to process
    :param bucket_url: ``s3://`` URL to copy the file from when not on platform
    :param project: platform project ID to search/upload in
    :param folder: platform folder to search/upload in
    :param skipvalidate: when True, skip the FastQC step entirely
    :returns: dict with keys "file", "report", "summary", "zip" (the last
        three are None when skipvalidate is True)
    """
    # Change the following to process whatever input this stage
    # receives. You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.
    logger.debug(filename)
    test = list(
        dxpy.find_data_objects(classname='file',
                               folder=folder,
                               project=project,
                               name_mode='exact',
                               name=filename,
                               return_handler=False))
    if not test:
        # Not on the platform yet: cp the file from the bucket, then upload.
        subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                              stderr=subprocess.STDOUT)
        subprocess.check_call(shlex.split('ls -l %s' % (filename)))
        dx_file = dxpy.upload_local_file(filename, project=project, folder=folder)
    else:
        # Already on the platform: fetch the existing copy locally.
        dxpy.download_dxfile(test[0]['id'], filename)
        dx_file = dxpy.dxfile.DXFile(test[0]['id'])

    # BUG FIX: str.rstrip() strips a *set of characters*, not a suffix --
    # "test.fastq".rstrip('.gz').rstrip('.fq').rstrip('.fastq') yields "te".
    # Remove the extensions as literal suffixes instead, matching FastQC's
    # <basename>_fastqc output naming.
    reads_basename = filename
    if reads_basename.endswith('.gz'):
        reads_basename = reads_basename[:-len('.gz')]
    for ext in ('.fq', '.fastq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]
            break

    if skipvalidate:
        return {"file": dx_file, "report": None, "summary": None, "zip": None}

    subprocess.check_call(['mkdir', 'output'])
    logger.info("Run QC")
    fqc_command = "/usr/bin/FastQC/fastqc " + filename + " -o output"
    logger.debug(fqc_command)
    stdio = subprocess.check_output(shlex.split(fqc_command))
    logger.debug(stdio)
    logger.debug(subprocess.check_output(['ls', '-l', 'output']))
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])

    logger.info("Upload results")
    subprocess.check_call([
        'mv', "%s_fastqc/fastqc_data.txt" % reads_basename,
        "%s_data.txt" % reads_basename
    ])
    subprocess.check_call([
        'mv', "%s_fastqc/summary.txt" % reads_basename,
        "%s_summary.txt" % reads_basename
    ])
    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename,
                                           folder=folder, project=project)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename,
                                            folder=folder, project=project)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename,
                                        folder=folder, project=project)
    logger.debug(report_dxfile)
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
def process(enc_file_name, bucket_url, proj_id, dx_folder, file_acc, dx_file_name, skipvalidate=False): # Change the following to process whatever input this stage # receives. You may also want to copy and paste the logic to download # and upload files here as well if this stage receives file input # and/or makes file output. print "* "+enc_file_name+" to "+dx_folder test = list( dxpy.find_data_objects(classname='file', folder=dx_folder, project=proj_id, name_mode='exact', name=dx_file_name, properties={ "accession": file_acc }, return_handler=False) ) start = datetime.now() if not test or len(test) == 0: try: #subprocess.check_call(shlex.split('aws s3 cp %s ./%s --quiet' %(bucket_url,dx_file_name)), stderr=subprocess.STDOUT) subprocess.check_call(shlex.split('aws s3 cp %s ./%s' % (bucket_url,dx_file_name)), stderr=subprocess.STDOUT) except: try: print "* s3 cp failed. Reverting to 'wget'" web_url = "https://www.encodeproject.org/files/%s/@@download/%s" % (file_acc,enc_file_name) subprocess.check_call(shlex.split('wget %s -O %s --quiet' % (web_url,dx_file_name) ), stderr=subprocess.STDOUT) except: print "* ERROR: Upload failed" sys.exit(1) # Better to fail than to return empty handed. #return { # "file": None, # "report": None, # "summary": None, # "zip": None #} end = datetime.now() duration = end - start start = end print "* copied to dx local in %.2f seconds" % duration.seconds subprocess.check_call(shlex.split('ls -l %s' %(dx_file_name))) # Make sure folder exists before copying! 
project = dxpy.DXProject(proj_id) ## should be default dx_file = dxpy.upload_local_file(dx_file_name, project=proj_id, folder=dx_folder, properties={ "accession": file_acc }) end = datetime.now() duration = end - start print "* Uploaded to dx project in %.2f seconds" % duration.seconds else: dxpy.download_dxfile(test[0]['id'], dx_file_name) dx_file=dxpy.dxfile.DXFile(test[0]['id']) end = datetime.now() duration = end - start print "* Downloaded already existing file from in %.2f seconds" % duration.seconds if skipvalidate or not (dx_file_name.endswith(".fastq.gz") or dx_file_name.endswith(".fq.gz")): return { "file": dx_file, "report": None, "summary": None, "zip": None } subprocess.check_call(['mkdir', 'output']) print "* Run QC" fqc_command = "/usr/bin/FastQC/fastqc " + dx_file_name + " -o output" print "* " + fqc_command subprocess.check_output(shlex.split(fqc_command)) subprocess.check_output(['ls','-l', 'output']) reads_basename = dx_file_name.rstrip('.gz').rstrip('.fq').rstrip('.fastq') subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename]) print "* Upload results" subprocess.check_call(['mv', "%s_fastqc/fastqc_data.txt" % reads_basename, "%s_data.txt" % reads_basename ]) subprocess.check_call(['mv', "%s_fastqc/summary.txt" % reads_basename, "%s_summary.txt" % reads_basename ]) report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename, folder=dx_folder, project=proj_id) summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename, folder=dx_folder, project=proj_id) zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename, folder=dx_folder, project=proj_id) print report_dxfile return { "file": dx_file, "report": report_dxfile, "summary": summary_dxfile, "zip": zip_dxfile }
def main(folder_name, key_name, assembly, noupload, force, debug):
    """Accession filtered bams under ``folder_name/bams`` to the ENCODE portal.

    For each closed *.bam in the project's folder tree, checks the portal for
    duplicates, gathers flagstat/dup/xcor/pbc QC, POSTs a file object, and
    uploads the bam to the returned AWS credentials.  Writes a tab-separated
    accession log back to the platform and returns it as ``file_mapping``
    together with a link to the uploaded log file.
    """
    # accessions bams contained within the folder named folder_name/bams
    # Requires
    # . directory structure folder_name/bams/ENCSRxxxabc/ ... /basename[.anything].bam
    # . basename contains one or more ENCFF numbers from which the bam is derived
    # . bam_filename.flagstat.qc exists
    # . raw bam flagstat file exists in folder_name/raw_bams/ENCSRxxxabc/ ... /basename[.anything].flagstat.qc
    # if bam file's tags on DNAnexus already contains and ENCFF number, assume it's already accessioned and skip
    # create a fully qualified project:filename for submitted_file_name and calculate the file size
    # if an ENCFF objects exists with the same submitted_file_name, AND it has the same size, skip
    # **INFER the experiment accession number from the bam's containing folder
    # calculate the md5
    # find the raw bam's .flagstat.qc file and parse
    # find the bam's .flagstat.qc file and parse
    # **ASSUME all derived_from ENCFF's appear in the bam's filename
    # POST file object
    # Upload to AWS

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # Normalize the folder name to the form "/.../".
    if not folder_name.startswith("/"):
        folder_name = "/" + folder_name
    if not folder_name.endswith("/"):
        folder_name += "/"

    try:
        project = dxpy.DXProject(dxpy.PROJECT_CONTEXT_ID)
        project_name = project.describe().get("name")
    except:
        logger.error("Failed to resolve proejct")
        project_name = ""

    bam_folder = folder_name + "bams/"
    # Generator of DXFile handlers for every closed bam under bam_folder.
    bams = dxpy.find_data_objects(
        classname="file",
        state="closed",
        name="*.bam",
        name_mode="glob",
        project=dxpy.PROJECT_CONTEXT_ID,
        folder=bam_folder,
        recurse=True,
        return_handler=True,
    )

    authid, authpw, server = processkey(key_name)

    # Pick whichever md5 tool exists on this system (macOS "md5 -q" vs GNU
    # "md5sum"); subprocess.call returns 0 (falsy) when "which" finds it.
    if not subprocess.call("which md5", shell=True):
        md5_command = "md5 -q"
    elif not subprocess.call("which md5sum", shell=True):
        md5_command = "md5sum"
    else:
        logger.error("Cannot find md5 or md5sum command")
        md5_command = ""

    file_mapping = []
    for bam in bams:
        # Skip bams whose tags already carry an ENCODE/test accession number.
        already_accessioned = False
        for tag in bam.tags:
            m = re.search(r"(ENCFF\d{3}\D{3})|(TSTFF\D{6})", tag)
            if m:
                logger.info(
                    "%s appears to contain ENCODE accession number in tag %s ... skipping"
                    % (bam.name, m.group(0))
                )
                already_accessioned = True
                break
        if already_accessioned:
            continue
        bam_description = bam.describe()
        submitted_file_name = project_name + ":" + "/".join([bam.folder, bam.name])
        submitted_file_size = bam_description.get("size")
        # Ask the portal whether a file with this submitted_file_name exists.
        url = urlparse.urljoin(
            server,
            "search/?type=file&submitted_file_name=%s&format=json&frame=object" % (submitted_file_name)
        )
        r = encoded_get(url, authid, authpw)
        try:
            r.raise_for_status()
            if r.json()["@graph"]:
                # Treat a same-size, non-deleted portal file as a duplicate.
                for duplicate_item in r.json()["@graph"]:
                    if duplicate_item.get("status") == "deleted":
                        logger.info("A potential duplicate file was found but its status=deleted ... proceeding")
                        duplicate_found = False
                    else:
                        logger.info("Found potential duplicate: %s" % (duplicate_item.get("accession")))
                        if submitted_file_size == duplicate_item.get("file_size"):
                            logger.info(
                                "%s %s: File sizes match, assuming duplicate."
                                % (str(submitted_file_size), duplicate_item.get("file_size"))
                            )
                            duplicate_found = True
                            break
                        else:
                            logger.info(
                                "%s %s: File sizes differ, assuming new file."
                                % (str(submitted_file_size), duplicate_item.get("file_size"))
                            )
                            duplicate_found = False
            else:
                logger.info("No duplicate ... proceeding")
                duplicate_found = False
        except:
            # On any error, proceed as if no duplicate exists.
            logger.warning("Duplicate accession check failed: %s %s" % (r.status_code, r.reason))
            logger.debug(r.text)
            duplicate_found = False

        if duplicate_found:
            if force:
                logger.info("Duplicate detected, but force=true, so continuing")
            else:
                logger.info("Duplicate detected, skipping")
                continue

        # Locate the four QC files next to the bam; a missing one skips the bam.
        try:
            bamqc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.flagstat.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("Flagstat file not found ... skipping")
            continue
            # NOTE(review): unreachable after "continue" -- looks like a
            # leftover fallback to proceed with a None handle; confirm intent.
            bamqc_fh = None
        raw_bams_folder = str(bam.folder).replace("%sbams/" % (folder_name), "%sraw_bams/" % (folder_name), 1)
        try:
            raw_bamqc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.flagstat.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=raw_bams_folder,
                return_handler=True,
            )
        except:
            logger.warning("Raw flagstat file not found ... skipping")
            continue
            # NOTE(review): unreachable -- see note above.
            raw_bamqc_fh = None
        try:
            dup_qc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.dup.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("Picard duplicates QC file not found ... skipping")
            continue
            # NOTE(review): unreachable -- see note above.
            dup_qc_fh = None
        try:
            xcor_qc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.cc.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("Cross-correlation QC file not found ... skipping")
            continue
            # NOTE(review): unreachable -- see note above.
            xcor_qc_fh = None
        try:
            pbc_qc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.pbc.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("PBC QC file not found ... skipping")
            continue
            # NOTE(review): unreachable -- see note above.
            pbc_qc_fh = None

        # Infer the ENCSR experiment accession from the bam's folder path.
        experiment_accession = re.match("\S*(ENC\S{8})", bam.folder).group(1)
        logger.info("Downloading %s" % (bam.name))
        dxpy.download_dxfile(bam.get_id(), bam.name)
        md5_output = subprocess.check_output(" ".join([md5_command, bam.name]), shell=True)
        calculated_md5 = md5_output.partition(" ")[0].rstrip()
        # NOTE(review): FILE_OBJ_TEMPLATE is assigned (not copied), so the
        # shared template dict is mutated across loop iterations -- confirm.
        encode_object = FILE_OBJ_TEMPLATE
        encode_object.update({"assembly": assembly})
        notes = {
            "filtered_qc": flagstat_parse(bamqc_fh),
            "qc": flagstat_parse(raw_bamqc_fh),
            "dup_qc": dup_parse(dup_qc_fh),
            "xcor_qc": xcor_parse(xcor_qc_fh),
            "pbc_qc": pbc_parse(pbc_qc_fh),
            "dx-id": bam_description.get("id"),
            "dx-createdBy": bam_description.get("createdBy"),
        }
        encode_object.update(
            {
                "dataset": experiment_accession,
                "notes": json.dumps(notes),
                "submitted_file_name": submitted_file_name,
                "derived_from": re.findall("(ENCFF\S{6})", bam.name),
                "file_size": submitted_file_size,
                "md5sum": calculated_md5,
            }
        )
        logger.info("Experiment accession: %s" % (experiment_accession))
        logger.debug("File metadata: %s" % (encode_object))
        # POST the new file object to the portal.
        url = urlparse.urljoin(server, "files")
        r = encoded_post(url, authid, authpw, encode_object)
        try:
            r.raise_for_status()
            new_file_object = r.json()["@graph"][0]
            logger.info("New accession: %s" % (new_file_object.get("accession")))
        except:
            logger.warning("POST file object failed: %s %s" % (r.status_code, r.reason))
            logger.debug(r.text)
            new_file_object = {}
            if r.status_code == 409:
                # 409 conflict: the md5 may already be accessioned.
                try:
                    # cautiously add a tag with the existing accession number
                    if calculated_md5 in r.json().get("detail"):
                        url = urlparse.urljoin(server, "/search/?type=file&md5sum=%s" % (calculated_md5))
                        r = encoded_get(url, authid, authpw)
                        r.raise_for_status()
                        accessioned_file = r.json()["@graph"][0]
                        existing_accession = accessioned_file["accession"]
                        bam.add_tags([existing_accession])
                        logger.info("Already accessioned. Added %s to dxfile tags" % (existing_accession))
                except:
                    logger.info("Conflict does not appear to be md5 ... continuing")

        if noupload:
            logger.info("--noupload so skipping upload")
            upload_returncode = -1
        else:
            if new_file_object:
                # Use the portal-issued temporary AWS credentials to push the bam.
                creds = new_file_object["upload_credentials"]
                env = os.environ.copy()
                env.update(
                    {
                        "AWS_ACCESS_KEY_ID": creds["access_key"],
                        "AWS_SECRET_ACCESS_KEY": creds["secret_key"],
                        "AWS_SECURITY_TOKEN": creds["session_token"],
                    }
                )
                logger.info("Uploading file.")
                start = time.time()
                try:
                    subprocess.check_call(["aws", "s3", "cp", bam.name, creds["upload_url"], "--quiet"], env=env)
                except subprocess.CalledProcessError as e:
                    # The aws command returns a non-zero exit code on error.
                    logger.error("Upload failed with exit code %d" % e.returncode)
                    upload_returncode = e.returncode
                else:
                    upload_returncode = 0
                    end = time.time()
                    duration = end - start
                    logger.info("Uploaded in %.2f seconds" % duration)
                    # Tag the dxfile so reruns skip this bam.
                    bam.add_tags([new_file_object.get("accession")])
            else:
                upload_returncode = -1

        # One tab-separated log line per bam processed.
        out_string = "\t".join(
            [
                experiment_accession,
                encode_object.get("submitted_file_name"),
                new_file_object.get("accession") or "",
                str(upload_returncode),
                encode_object.get("notes"),
            ]
        )
        print out_string
        file_mapping.append(out_string)
        os.remove(bam.name)

    # Persist the accession log back to the platform and return it.
    output_log_filename = time.strftime("%m%d%y%H%M") + "-accession_log.csv"
    out_fh = dxpy.upload_string("\n".join(file_mapping), name=output_log_filename, media_type="text/csv")
    out_fh.close()
    output = {"file_mapping": file_mapping, "outfile": dxpy.dxlink(out_fh)}
    return output
def _clone_asset(record, folder, regions, project_dict):
    """
    This function will attempt to clone the given record into all of the
    given regions.  It will return a dictionary with the regions as keys and
    the record-ids of the corresponding asset as the values.  If an asset is
    not able to be created in a given region, the value will be set to None.

    NOTE(review): the visible implementation does not actually return the
    described dictionary (it returns None) -- confirm against callers.
    """
    # Get the asset record
    fid = record.get_details()['archiveFileId']['$dnanexus_link']
    curr_region = dxpy.describe(record.project)['region']

    # Only run once per region; the source region already holds the asset.
    regions = set(regions) - set([curr_region])
    if len(regions) == 0:
        # there is nothing to do
        return

    app_supported_regions = set(
        COPY_FILE_APP.describe()['regionalOptions'].keys())
    if len(regions - app_supported_regions) > 0:
        print('Currently no support for the following region(s): [{regions}]'.
              format(regions=', '.join(regions - app_supported_regions)),
              file=sys.stderr)
        sys.exit(1)

    # Get information about the asset
    asset_properties = record.get_properties()
    asset_properties['cloned_from'] = record.get_id()
    asset_file_name = dxpy.describe(fid)['name']
    url = dxpy.DXFile(fid).get_download_url(
        preauthenticated=True,
        project=dxpy.DXFile.NO_PROJECT_HINT,
        duration=URL_DURATION)[0]

    # setup target folders
    region2projid = {}
    for region in regions:
        dest_proj = util.get_project(project_dict[region])
        dest_proj.new_folder(folder, parents=True)
        region2projid[region] = dest_proj.get_id()
    print(region2projid)

    # Fire off a clone process for each region and wait for the cloning to
    # complete, retrying up to three times.
    for i in [1, 2, 3]:
        jobs = _clone_to_all_regions(region2projid, regions, asset_file_name,
                                     folder, url)
        retval = _wait_for_completion(jobs)
        if retval:
            break

    # make records for each file
    for region in regions:
        dest_proj_id = region2projid[region]
        results = list(
            dxpy.find_data_objects(classname="file",
                                   visibility="hidden",
                                   name=asset_file_name,
                                   project=dest_proj_id,
                                   folder=folder))
        file_ids = [p["id"] for p in results]
        if len(file_ids) == 0:
            raise RuntimeError("Found no files {}:{}/{}".format(
                dest_proj_id, folder, asset_file_name))
        if len(file_ids) > 1:
            # BUG FIX: the original message formatted len(dxfiles), an
            # undefined name, so this path raised NameError instead of the
            # intended RuntimeError.
            raise RuntimeError(
                "Found {} files {}:{}/{}, instead of just one".format(
                    len(file_ids), dest_proj_id, folder, asset_file_name))
        dest_asset = dxpy.new_dxrecord(
            name=record.name,
            types=['AssetBundle'],
            details={'archiveFileId': dxpy.dxlink(file_ids[0])},
            properties=record.get_properties(),
            project=dest_proj_id,
            folder=folder,
            close=True)
def get_tas(experiment, server, keypair, default_project, ta_folders):
    """Find the two replicate tagAlign (TA) files for an ENCODE experiment
    and pair each with its control TA.

    Searches ``<ta_folder>/bams/<experiment accession>/`` in each supplied
    folder (optionally prefixed ``project:``), infers replicate numbers from
    trailing ``.../repN`` folder components, resolves controls via the
    fastqs' ``controlled_by`` links (falling back to ``possible_controls``),
    and returns a dict keyed ``rep1_ta``/``rep2_ta``, or None on any failure.
    """
    # tas = {
    #    'rep1_ta': {
    #        'file_id': "",
    #        'project_id': "",
    #        'folder': "",
    #        'name': "",
    #        'paired_end': False,
    #        'control_path': "",
    #        'enc_repn': 0
    # .for each ta_folder get list of TA's in /ta_folder/bams/ENCSR...
    # .from this list infer repns from the paths ../bams/ENCSR.../repn*
    # .from this list infer the ENCFF's for the fastqs that were used
    # for each repn go to the experiment and find all the fastqs for that rep
    # if there are different fastq's in the experiment, or different reps, warn
    # for each fastq found in the TA filename, find its controlled_by
    # if any have controlled_by, all must have controlled_by else error
    #   gather the list of controlled by and find a TA (anywhere in ta_folders) with those ENCFF's, else error
    # else get possible_controls and try to match the repn, else pick one (rememeber it)
    #   gather the list of fastqs in the possible_controls and find (one) TA with those ENCFF's, else error

    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        # A folder may be qualified as "project:path"; otherwise use the
        # default project.
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        # Normalize to "/.../" form.
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug(
            "Looking for TA's in %s %s %s" % (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
            classname='file',
            state='closed',
            folder=path + 'bams/%s/' %(exp_id),
            project=project_id,
            describe=True,
            recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' %(len(possible_files)))
    logging.debug('%s' %([(f.get('folder'),f.get('name')) for f in possible_files]))

    # Infer replicate numbers from trailing ".../repN" folder components,
    # keeping only the first TA found per replicate.
    repns = []
    files_to_ignore = []
    for f in possible_files:
        m = re.search('/rep(\d+)$',f['folder'])
        if m:
            repn = int(m.group(1))
            logging.debug("Matched rep%d" %(repn))
            if repn in repns:
                logging.warning("Ignoring additional rep%d bam, using first found" %(repn))
                files_to_ignore.append(f)
            else:
                logging.debug("First time finding rep%d" %(repn))
                repns.append(repn)
        else:
            logging.error("Cannot parse rep number from %s" %(f['folder']))
            return None
    for f in files_to_ignore:
        possible_files.remove(f)
    logging.debug('Discovered repns %s' %(repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s" %(exp_id, len(repns), repns))
        return None

    tas = {}
    used_controls = []
    # After the filtering above, possible_files[i] corresponds to repns[i].
    for i, repn in enumerate(repns):
        # Fetch the portal metadata for each ENCFF fastq encoded in the TA
        # file name, then collect their controlled_by links.
        encode_files = [common.encoded_get(server+'/files/%s/' %(f), keypair) for f in get_encffs(possible_files[i].get('name'))]
        controlled_by = common.flat([f.get('controlled_by') for f in encode_files])
        if any(controlled_by):
            # Portal URIs look like "/files/ENCFFxxxxxx/"; element 2 is the accession.
            controlled_by_accessions = list(set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(controlled_by_accessions, default_project, ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error("%s: Could not find controlled_by_ta for accessions %s" %(experiment.get('accession'), controlled_by_accessions))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
        else:
            #evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning('%s: No controlled_by for rep%d, attempting to infer from possible_controls %s' %(experiment.get('accession'), repn, possible_controls))
            if not possible_controls or not any(possible_controls):
                logging.error('%s: Could not find controlled_by or resolve possible_controls for rep%d' %(experiment.get('accession'), repn))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_controls)
                controlled_by_ta_name = control_ta.get('name')
                controlled_by_ta_id = control_ta.get('id')
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps' %(controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        #if encode repns are 1,2 then let the pipline input rep numbers (1 or 2) be the same.
        #Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        if repn == min(repns):
            ta_index = 1
        else:
            ta_index = 2
        tas.update(
            {'rep%d_ta' %(ta_index): {
                'file_id': possible_files[i].get('id'),
                'project_id': possible_files[i].get('project'),
                'folder': possible_files[i].get('folder'),
                'file_name': possible_files[i].get('name'),
                'enc_fqs': get_encffs(possible_files[i].get('name')),
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(possible_files[i])
                }
            }
        )

    return tas
def main(pop1, pop2, skip=25, recals=2):
    """Drive pairwise PSMC pipeline runs over consensus-sequence files.

    Looks up files in the hard-coded PSMC_20 project whose names match the
    regexps ``pop1`` and ``pop2`` under /ConsensusSequences, then prints a
    ``dx run`` command line for every file pair:

    - if ``pop1 == pop2`` (or no ``pop2`` files exist), all unordered
      intra-population pairs (i, j>i) of the ``pop1`` files;
    - otherwise the full cross product of ``pop1`` x ``pop2`` files.

    Args:
        pop1: regexp matching the first population's consensus files.
        pop2: regexp matching the second population's consensus files.
        skip: value forwarded as the pipeline's ``skip`` input (default 25).
        recals: value forwarded as the pipeline's ``recalnums`` input (default 2).

    Returns:
        dict with "psmcfaFiles" and "psmcFiles" output refs gathered from
        ``appjobs`` — currently always empty lists, because the
        ``pipeline.run(...)`` calls are commented out and only the equivalent
        ``dx run`` command lines are printed. Returns {} early when pop1 !=
        pop2 but no pop2 files were found.

    NOTE(review): Python 2 only — uses ``print`` statements and
    ``dict.keys().sort()``.
    """
    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.
    # Hard-coded project ID; the dynamic lookup below is disabled.
    psmc20_id = (
        "project-B53fX06gYqYbb6B87kgQ0007"
    )  # Dxpy.find_one_project(zero_ok=True, more_ok=False, name="PSMC_20")['id']
    # print psmc20_id, dxpy.WORKSPACE_ID
    pipeline = dxpy.find_one_data_object(
        name="PSMC-pipeline", name_mode="regexp", project=psmc20_id, return_handler=True
    )
    # Map file name -> file ID for population 1.
    files1 = {}
    for result in dxpy.find_data_objects(
        name=pop1, name_mode="regexp", classname="file",
        folder="/ConsensusSequences", project=psmc20_id
    ):
        id = result["id"]
        name = dxpy.describe(id)["name"]
        files1[name] = id
    # Map file name -> file ID for population 2 (only when it differs from pop1).
    files2 = {}
    if pop1 != pop2:
        for result in dxpy.find_data_objects(
            name=pop2, name_mode="regexp", classname="file",
            folder="/ConsensusSequences", project=psmc20_id
        ):
            id = result["id"]
            name = dxpy.describe(id)["name"]
            files2[name] = id
    # Cross-population mode was requested but nothing matched pop2: bail out.
    if len(files2) == 0 and pop1 != pop2:
        return {}
    appjobs = []
    if len(files2) == 0:
        # Single population processing: all unordered pairs within files1.
        subjobs = []
        fn1sort = files1.keys()
        fn1sort.sort()
        for i in range(len(fn1sort)):
            for j in range(i + 1, len(fn1sort)):
                outroot = pop1 + "." + str(i + 1) + "." + pop1 + "." + str(j + 1)
                applet_in = {
                    "cons1": dxpy.dxlink(files1[fn1sort[i]]),
                    "cons2": dxpy.dxlink(files1[fn1sort[j]]),
                    "outroot": outroot,
                    "skip": skip,
                    "recalnums": recals,
                }
                # Actual job submission is disabled; emit the equivalent CLI command instead.
                # appjobs.append(pipeline.run(applet_input=applet_in))
                print "dx run -y --folder /psmcfa -icons1=/ConsensusSequences/" + fn1sort[i] + " -icons2=/ConsensusSequences/" + fn1sort[j] + " -ioutroot=" + outroot + " -iskip=" + str(skip) + " -irecalnums=" + str(recals) + " PSMC-pipeline"
    elif len(files2) > 0:
        # Two-population processing: full cross product files1 x files2.
        subjobs = []
        fn1sort = files1.keys()
        fn2sort = files2.keys()
        fn1sort.sort()
        fn2sort.sort()
        for i in range(len(fn1sort)):
            for j in range(len(fn2sort)):
                outroot = pop1 + "." + str(i + 1) + "." + pop2 + "." + str(j + 1)
                applet_in = {
                    "cons1": dxpy.dxlink(files1[fn1sort[i]]),
                    "cons2": dxpy.dxlink(files2[fn2sort[j]]),
                    "outroot": outroot,
                    "skip": skip,
                    "recalnums": recals,
                }
                # Actual job submission is disabled; emit the equivalent CLI command instead.
                # appjobs.append(pipeline.run(applet_input=applet_in))
                print "dx run -y --folder /psmcfa -icons1=/ConsensusSequences/" + fn1sort[i] + " -icons2=/ConsensusSequences/" + fn2sort[j] + " -ioutroot=" + outroot + " -iskip=" + str(skip) + " -irecalnums=" + str(recals) + " PSMC-pipeline"
    # for job in app1jobs.keys():
    #     print job
    #     print app1jobs[job]
    #     print(app1jobs[job].describe())
    #     print app1jobs[job].get_output_ref("psmcfa")
    #     print app1jobs[job].get_output_ref("psmcfa").describe()

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    # of1 = {}
    # for j in app1jobs:
    #     of1[j] = app1jobs[j].get_output_ref("psmcfa")
    # postprocess_job = dxpy.new_dxjob(fn_input={"files1":of1, "files2":[]}, fn_name="postprocess")

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    # Collect output references from submitted jobs (empty while job
    # submission above remains commented out).
    psmcfaFiles = []
    psmcFiles = []
    for job in appjobs:
        psmcfaFiles.append(job.get_output_ref("outfile1"))
        psmcFiles.append(job.get_output_ref("outfile2"))
    output = {"psmcfaFiles": psmcfaFiles, "psmcFiles": psmcFiles}
    return output
def build_and_upload_locally(
    src_dir,
    mode,
    overwrite=False,
    archive=False,
    publish=False,
    destination_override=None,
    version_override=None,
    bill_to_override=None,
    use_temp_build_project=True,
    do_parallel_build=True,
    do_version_autonumbering=True,
    do_try_update=True,
    dx_toolkit_autodep="stable",
    do_check_syntax=True,
    dry_run=False,
    return_object_dump=False,
    confirm=True,
    ensure_upload=False,
    region=None,
    **kwargs
):
    """Build an applet or app from a local source directory and upload it.

    Runs the local build, verifies the source dir, then uploads the applet
    (and, for ``mode == "app"``, creates an app version from it).

    Args:
        src_dir: path to the local app/applet source directory (dxapp.json).
        mode: "app" or "applet"; anything else raises AppBuilderException.
        overwrite / archive: how to resolve a name collision with an
            existing applet at the destination (applet mode only).
        publish: if True, publish the app and make it the default version.
        destination_override: "project:/folder/name" string parsed via
            parse_destination (applet mode only).
        version_override: explicit app version to use instead of the
            dxapp.json "version".
        bill_to_override: billTo entity passed through to create_app.
        use_temp_build_project: app mode only — build inside a temporary
            project that is destroyed in the ``finally`` block.
        do_parallel_build / do_version_autonumbering / do_try_update /
        do_check_syntax: feature toggles forwarded to the helper steps.
        dx_toolkit_autodep: dx-toolkit dependency flavor; "auto" is
            normalized to "stable" below.
        dry_run: go through the motions without creating anything;
            returns None after the upload_applet dry run.
        return_object_dump: if True, return the full describe hash instead
            of just {"id": ...}.
        confirm / ensure_upload / region / **kwargs: forwarded to the
            corresponding dxpy.app_builder calls.

    Returns:
        App/applet describe hash or {"id": ...} depending on
        ``return_object_dump``; None when ``dry_run`` is set.
    """
    dxpy.app_builder.build(src_dir, parallel_build=do_parallel_build)
    app_json = _parse_app_spec(src_dir)
    _verify_app_source_dir(src_dir, mode, enforce=do_check_syntax)
    if mode == "app" and not dry_run:
        _verify_app_writable(app_json["name"])

    working_project = None
    using_temp_project = False
    override_folder = None
    override_applet_name = None

    if mode == "applet" and destination_override:
        working_project, override_folder, override_applet_name = parse_destination(destination_override)
    elif mode == "app" and use_temp_build_project and not dry_run:
        # Create a temp project
        try:
            if region:
                working_project = dxpy.api.project_new(
                    {"name": "Temporary build project for dx-build-app", "region": region}
                )["id"]
            else:
                working_project = dxpy.api.project_new({"name": "Temporary build project for dx-build-app"})["id"]
        except:
            err_exit()
        logger.debug("Created temporary project %s to build in" % (working_project,))
        using_temp_project = True

    try:
        if mode == "applet" and working_project is None and dxpy.WORKSPACE_ID is None:
            parser.error(
                "Can't create an applet without specifying a destination project; please use the -d/--destination flag to explicitly specify a project"
            )
        # A dxapp.json buildOptions entry may disable the dx-toolkit dependency.
        if "buildOptions" in app_json:
            if app_json["buildOptions"].get("dx_toolkit_autodep") == False:
                dx_toolkit_autodep = False

        # Perform check for existence of applet with same name in
        # destination for case in which neither "-f" nor "-a" is
        # given BEFORE uploading resources.
        if mode == "applet" and not overwrite and not archive:
            try:
                dest_name = override_applet_name or app_json.get("name") or os.path.basename(os.path.abspath(src_dir))
            except:
                raise dxpy.app_builder.AppBuilderException(
                    "Could not determine applet name from specification + "
                    "(dxapp.json) or from working directory (%r)" % (src_dir,)
                )
            dest_folder = override_folder or app_json.get("folder") or "/"
            if not dest_folder.endswith("/"):
                dest_folder = dest_folder + "/"
            dest_project = working_project if working_project else dxpy.WORKSPACE_ID
            for result in dxpy.find_data_objects(
                classname="applet", name=dest_name, folder=dest_folder, project=dest_project, recurse=False
            ):
                dest_path = dest_folder + dest_name
                msg = "An applet already exists at {} (id {}) and neither".format(dest_path, result["id"])
                msg += " -f/--overwrite nor -a/--archive were given."
                raise dxpy.app_builder.AppBuilderException(msg)

        bundled_resources = (
            dxpy.app_builder.upload_resources(
                src_dir, project=working_project, folder=override_folder, ensure_upload=ensure_upload
            )
            if not dry_run
            else []
        )

        try:
            # TODO: the "auto" setting is vestigial and should be removed.
            if dx_toolkit_autodep == "auto":
                dx_toolkit_autodep = "stable"
            applet_id, applet_spec = dxpy.app_builder.upload_applet(
                src_dir,
                bundled_resources,
                check_name_collisions=(mode == "applet"),
                overwrite=overwrite and mode == "applet",
                archive=archive and mode == "applet",
                project=working_project,
                override_folder=override_folder,
                override_name=override_applet_name,
                dx_toolkit_autodep=dx_toolkit_autodep,
                dry_run=dry_run,
                **kwargs
            )
        except:
            # Avoid leaking any bundled_resources files we may have
            # created, if applet creation fails. Note that if
            # using_temp_project, the entire project gets destroyed at
            # the end, so we don't bother.
            if not using_temp_project:
                objects_to_delete = [
                    dxpy.get_dxlink_ids(bundled_resource_obj["id"])[0]
                    for bundled_resource_obj in bundled_resources
                ]
                if objects_to_delete:
                    dxpy.api.project_remove_objects(
                        dxpy.app_builder.get_destination_project(src_dir, project=working_project),
                        input_params={"objects": objects_to_delete},
                    )
            raise

        if dry_run:
            return

        applet_name = applet_spec["name"]
        logger.debug("Created applet " + applet_id + " successfully")

        if mode == "app":
            if "version" not in app_json:
                parser.error('dxapp.json contains no "version" field, but it is required to build an app')
            version = app_json["version"]
            # Try the requested version first; optionally fall back to an
            # autonumbered version with a suffix derived from the source tree.
            try_versions = [version_override or version]
            if not version_override and do_version_autonumbering:
                try_versions.append(version + _get_version_suffix(src_dir, version))

            app_id = dxpy.app_builder.create_app(
                applet_id,
                applet_name,
                src_dir,
                publish=publish,
                set_default=publish,
                billTo=bill_to_override,
                try_versions=try_versions,
                try_update=do_try_update,
                confirm=confirm,
            )

            app_describe = dxpy.api.app_describe(app_id)

            if publish:
                print(
                    "Uploaded and published app %s/%s (%s) successfully"
                    % (app_describe["name"], app_describe["version"], app_id),
                    file=sys.stderr,
                )
            else:
                print(
                    "Uploaded app %s/%s (%s) successfully" % (app_describe["name"], app_describe["version"], app_id),
                    file=sys.stderr,
                )
                print("You can publish this app with:", file=sys.stderr)
                print(
                    ' dx api app-%s/%s publish "{\\"makeDefault\\": true}"'
                    % (app_describe["name"], app_describe["version"]),
                    file=sys.stderr,
                )

            return app_describe if return_object_dump else {"id": app_id}

        elif mode == "applet":
            return dxpy.api.applet_describe(applet_id) if return_object_dump else {"id": applet_id}
        else:
            raise dxpy.app_builder.AppBuilderException("Unrecognized mode %r" % (mode,))

    finally:
        # Clean up after ourselves.
        if using_temp_project:
            dxpy.api.project_destroy(working_project)
def upload_applet(src_dir, uploaded_resources, check_name_collisions=True,
                  overwrite=False, archive=False, project=None,
                  override_folder=None, override_name=None,
                  dx_toolkit_autodep="stable", dry_run=False, **kwargs):
    """
    Creates a new applet object.

    :param src_dir: directory containing the applet specification (dxapp.json).
    :type src_dir: str
    :param uploaded_resources: bundledDepends entries to attach to the applet's
        runSpec, or None to attach nothing.
    :param check_name_collisions: whether to look for an existing applet at the
        destination path and resolve it per ``overwrite``/``archive``.
    :type check_name_collisions: boolean
    :param overwrite: delete a pre-existing applet at the destination (after
        the new one is created).
    :type overwrite: boolean
    :param archive: move a pre-existing applet into /.Applet_archive and
        rename it with its creation timestamp.
    :type archive: boolean
    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to inject if none is present. "stable" for the APT package; "git" for HEAD of dx-toolkit master branch; or False for no dependency.
    :type dx_toolkit_autodep: boolean or string
    :param dry_run: print the applet spec that would be created and return
        (None, None) without creating anything.
    :type dry_run: boolean
    :returns: (applet ID, applet spec) tuple.
    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    # Fill in defaults for any spec fields dxapp.json omitted.
    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException(
                "Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)" % (src_dir, ))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    applet_to_overwrite = None
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + (
            '/' if not applet_spec['folder'].endswith('/') else '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet",
                                             name=applet_spec["name"],
                                             folder=applet_spec['folder'],
                                             project=dest_project,
                                             recurse=False):
            if overwrite:
                # Don't remove the old applet until after the new one
                # has been created. This avoids a race condition where
                # we remove the old applet, but that causes garbage
                # collection of the bundled resources that will be
                # shared with the new applet
                applet_to_overwrite = result['id']
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                # Create the archive folder on first use.
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'], project=dest_project)

                # Tag the archived copy with its creation time so the name stays unique.
                now = datetime.datetime.fromtimestamp(archived_applet.created / 1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info(
                    "Archived applet %s to %s:\"%s/%s\"" % (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException(
                    "An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given" % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Inline Readme.md and Readme.developer.md
    _inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "runSpec" in applet_spec and "file" in applet_spec["runSpec"]:
        # Avoid using runSpec.file for now, it's not fully implemented
        #code_filename = os.path.join(src_dir, applet_spec["runSpec"]["file"])
        #f = dxpy.upload_local_file(code_filename, wait_on_close=True)
        #applet_spec["runSpec"]["file"] = f.get_id()
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir, applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"].setdefault("bundledDepends", [])
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {
            "name": "dx-toolkit",
            "package_manager": "git",
            "url": "git://github.com/dnanexus/dx-toolkit.git",
            "tag": "master",
            "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"
        }
    # TODO: reject "beta" and "unstable" eventually
    elif dx_toolkit_autodep in ("stable", "beta", "unstable"):
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException(
            "dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead" % (dx_toolkit_autodep, ))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(
                type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException(
                "Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(
            dep.get('name') in DX_TOOLKIT_PKGS or dep.get('url') in DX_TOOLKIT_GIT_URLS
            for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            # The git flavor builds from source at job time and needs network access.
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    # Caller-supplied keyword overrides take final precedence over the spec.
    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    # Mirror categories into tags (deduplicated).
    if applet_spec.get("categories", []):
        if "tags" not in applet_spec:
            applet_spec["tags"] = []
        applet_spec["tags"] = list(
            set(applet_spec["tags"]) | set(applet_spec["categories"]))

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})

    # Now it is permissible to delete the old applet, if any
    if applet_to_overwrite:
        logger.info("Deleting applet %s" % (applet_to_overwrite, ))
        # TODO: test me
        dxpy.DXProject(dest_project).remove_objects([applet_to_overwrite])

    return applet_id, applet_spec
return project['id'] if __name__ == '__main__': args = get_args() if args.sort_filter_and_remove_dups: args.duplicates_removed = True applets_project_id = resolve_applets_project() project = get_project(args.project_name) print 'Project: ' + project.describe()['name'] if project_has_controls_and_replicates_folders(project): replicates = dxpy.find_data_objects(classname='file', name='*.bam', name_mode='glob', project=project.get_id(), folder=REPLICATES_FOLDER, return_handler=False) replicates = [dxpy.dxlink(r) for r in replicates] controls = dxpy.find_data_objects(classname='file', name='*.bam', name_mode='glob', project=project.get_id(), folder=CONTROLS_FOLDER, return_handler=False) controls = [dxpy.dxlink(c) for c in controls] else: if (len(args.replicates) < 1) or (len(args.controls) < 1): sys.exit( 'Need to have at least 1 replicate file and 1 control file.') project.new_folder(REPLICATES_FOLDER, True)
def upload_applet(src_dir, uploaded_resources, check_name_collisions=True,
                  overwrite=False, archive=False, project=None,
                  override_folder=None, override_name=None,
                  dx_toolkit_autodep="stable", dry_run=False, **kwargs):
    """
    Creates a new applet object.

    :param src_dir: directory containing the applet specification (dxapp.json).
    :type src_dir: str
    :param uploaded_resources: bundledDepends entries to attach to the applet's
        runSpec, or None to attach nothing.
    :param check_name_collisions: whether to look for existing applets at the
        destination path and resolve each per ``overwrite``/``archive``.
    :type check_name_collisions: boolean
    :param overwrite: delete pre-existing applets at the destination (after
        the new one is created).
    :type overwrite: boolean
    :param archive: move pre-existing applets into /.Applet_archive and
        rename them with their creation timestamps.
    :type archive: boolean
    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to inject if none is present. "stable" for the APT package; "git" for HEAD of dx-toolkit master branch; or False for no dependency.
    :type dx_toolkit_autodep: boolean or string
    :param dry_run: print the applet spec that would be created and return
        (None, None) without creating anything.
    :type dry_run: boolean
    :returns: (applet ID, applet spec) tuple.
    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    # Fill in defaults for any spec fields dxapp.json omitted.
    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException(
                "Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)" % (src_dir, ))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    applets_to_overwrite = []
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + (
            '/' if not applet_spec['folder'].endswith('/') else '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet",
                                             name=applet_spec["name"],
                                             folder=applet_spec['folder'],
                                             project=dest_project,
                                             recurse=False):
            if overwrite:
                # Don't remove the old applet until after the new one
                # has been created. This avoids a race condition where
                # we remove the old applet, but that causes garbage
                # collection of the bundled resources that will be
                # shared with the new applet
                applets_to_overwrite.append(result['id'])
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                # Create the archive folder on first use.
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'], project=dest_project)

                # Tag the archived copy with its creation time so the name stays unique.
                now = datetime.datetime.fromtimestamp(archived_applet.created / 1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info(
                    "Archived applet %s to %s:\"%s/%s\"" % (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException(
                    "An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given" % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Carry region-specific values from regionalOptions into the main
    # runSpec
    applet_spec["runSpec"].setdefault("bundledDepends", [])
    applet_spec["runSpec"].setdefault("assetDepends", [])
    if not dry_run:
        region = dxpy.api.project_describe(
            dest_project, input_params={"fields": {
                "region": True
            }})["region"]

        # if regionalOptions contain at least one region, they must include
        # the region of the target project
        if len(applet_spec.get('regionalOptions', {})) != 0 and region not in applet_spec.get(
                'regionalOptions', {}):
            err_mesg = "destination project is in region {} but \"regionalOptions\" do not contain this region. ".format(
                region)
            err_mesg += "Please, update your \"regionalOptions\" specification"
            raise AppBuilderException(err_mesg)

        regional_options = applet_spec.get('regionalOptions', {}).get(region, {})

        # We checked earlier that if region-specific values for the
        # fields below are given, the same fields are not also specified
        # in the top-level runSpec. So the operations below should not
        # result in any user-supplied settings being clobbered.

        if 'systemRequirements' in regional_options:
            applet_spec["runSpec"]["systemRequirements"] = regional_options[
                'systemRequirements']

        if 'bundledDepends' in regional_options:
            applet_spec["runSpec"]["bundledDepends"].extend(
                regional_options["bundledDepends"])

        if 'assetDepends' in regional_options:
            applet_spec["runSpec"]["assetDepends"].extend(
                regional_options["assetDepends"])

    # Inline Readme.md and Readme.developer.md
    dxpy.executable_builder.inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "file" in applet_spec["runSpec"]:
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir, applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # If this is applet requires a cluster, inline any bootstrapScript code that may be provided.
    # bootstrapScript is an *optional* clusterSpec parameter.
    # NOTE: assumes bootstrapScript is always provided as a filename
    if "systemRequirements" in applet_spec["runSpec"]:
        sys_reqs = applet_spec["runSpec"]["systemRequirements"]
        for entry_point in sys_reqs:
            try:
                bootstrap_script = os.path.join(
                    src_dir, sys_reqs[entry_point]["clusterSpec"]["bootstrapScript"])
                with open(bootstrap_script) as code_fh:
                    sys_reqs[entry_point]["clusterSpec"][
                        "bootstrapScript"] = code_fh.read()
            except KeyError:
                # either no "clusterSpec" or no "bootstrapScript" within "clusterSpec"
                continue
            except IOError:
                raise AppBuilderException(
                    "The clusterSpec \"bootstrapScript\" could not be read.")

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Validate and process assetDepends
    asset_depends = applet_spec["runSpec"]["assetDepends"]
    if type(asset_depends) is not list or any(
            type(dep) is not dict for dep in asset_depends):
        raise AppBuilderException(
            "Expected runSpec.assetDepends to be an array of objects")
    for asset in asset_depends:
        asset_project = asset.get("project", None)
        asset_folder = asset.get("folder", '/')
        asset_stages = asset.get("stages", None)
        # An asset may be referenced either by record ID or by
        # (name, project, version) lookup.
        if "id" in asset:
            asset_record = dxpy.DXRecord(asset["id"]).describe(
                fields={'details'}, default_fields=True)
        elif "name" in asset and asset_project is not None and "version" in asset:
            try:
                asset_record = dxpy.find_one_data_object(
                    zero_ok=True,
                    classname="record",
                    typename="AssetBundle",
                    name=asset["name"],
                    properties=dict(version=asset["version"]),
                    project=asset_project,
                    folder=asset_folder,
                    recurse=False,
                    describe={
                        "defaultFields": True,
                        "fields": {
                            "details": True
                        }
                    },
                    state="closed",
                    more_ok=False)
            except dxpy.exceptions.DXSearchError:
                msg = "Found more than one asset record that matches: name={0}, folder={1} in project={2}."
                raise AppBuilderException(
                    msg.format(asset["name"], asset_folder, asset_project))
        else:
            raise AppBuilderException(
                "Each runSpec.assetDepends element must have either {'id'} or "
                "{'name', 'project' and 'version'} field(s).")
        if asset_record:
            # The two lookup paths return differently-shaped describe hashes.
            if "id" in asset:
                asset_details = asset_record["details"]
            else:
                asset_details = asset_record["describe"]["details"]
            if "archiveFileId" in asset_details:
                archive_file_id = asset_details["archiveFileId"]
            else:
                raise AppBuilderException(
                    "The required field 'archiveFileId' was not found in "
                    "the details of the asset bundle %s " % asset_record["id"])
            archive_file_name = dxpy.DXFile(archive_file_id).describe()["name"]
            bundle_depends = {"name": archive_file_name, "id": archive_file_id}
            if asset_stages:
                bundle_depends["stages"] = asset_stages
            applet_spec["runSpec"]["bundledDepends"].append(bundle_depends)
            # If the file is not found in the applet destination project, clone it from the asset project
            if (not dry_run and
                    dxpy.DXRecord(dxid=asset_record["id"],
                                  project=dest_project).describe()["project"] != dest_project):
                dxpy.DXRecord(
                    asset_record["id"],
                    project=asset_record["project"]).clone(dest_project)
        else:
            raise AppBuilderException(
                "No asset bundle was found that matched the specification %s"
                % (json.dumps(asset)))

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {
            "name": "dx-toolkit",
            "package_manager": "git",
            "url": "git://github.com/dnanexus/dx-toolkit.git",
            "tag": "master",
            "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"
        }
    elif dx_toolkit_autodep == "stable":
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException(
            "dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead" % (dx_toolkit_autodep, ))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(
                type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException(
                "Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(
            dep.get('name') in DX_TOOLKIT_PKGS or dep.get('url') in DX_TOOLKIT_GIT_URLS
            for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            # The git flavor builds from source at job time and needs network access.
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    # Caller-supplied keyword overrides take final precedence over the spec.
    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    # Mirror categories into tags (deduplicated).
    if applet_spec.get("categories", []):
        if "tags" not in applet_spec:
            applet_spec["tags"] = []
        applet_spec["tags"] = list(
            set(applet_spec["tags"]) | set(applet_spec["categories"]))

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})

    # Now it is permissible to delete the old applet(s), if any
    if applets_to_overwrite:
        logger.info("Deleting applet(s) %s" % (','.join(applets_to_overwrite)))
        dxpy.DXProject(dest_project).remove_objects(applets_to_overwrite)

    return applet_id, applet_spec
def find_file(filePath, project=None, verbose=False, multiple=False, recurse=True):
    '''Using a DX style file path, find the file.

    The path may be "project:/folder/name", "/folder/name" (with ``project``
    supplied separately), or a bare file name. If the path contains a "*" or
    "?" wildcard, glob matching is used; otherwise exact matching.

    Args:
        filePath: DX-style path, optionally prefixed with "project:".
        project: project name or "project-..." ID to search when the path
            carries no project prefix.
        verbose: print diagnostics on failure / multiple matches.
        multiple: allow (and return) more than one match.
        recurse: search subfolders of the path's folder.

    Returns:
        A single file ID (str) when exactly one file matches and ``multiple``
        is False; a list of file IDs when ``multiple`` is True; or None when
        no project is known, nothing matches, or several files match
        unexpectedly. Matched links are cached in the module-level FILES dict.
    '''
    proj = project
    path = filePath
    fileName = filePath
    if ':' in filePath:
        proj, path = filePath.split(':', 1)
    if path.rfind('/') != -1:
        path, fileName = path.rsplit('/', 1)
    else:
        fileName = path
        path = '/'
    if proj is None:
        if verbose:
            print("ERROR: Don't know what project to use for '" + path + "'.")
        return None
    if proj.find('project-') == 0:
        projId = proj
    else:
        projId = get_project(proj, level='VIEW').get_id()
    # BUG FIX: the old test was `filePath.find('*') or filePath.find('?')`.
    # str.find() returns -1 (truthy) when the character is absent, so glob
    # mode was selected for almost every path, including wildcard-free ones.
    # Use glob matching only when the path actually contains a wildcard.
    mode = 'exact'
    if '*' in filePath or '?' in filePath:
        mode = 'glob'
    fileDicts = list(
        dxpy.find_data_objects(classname='file',
                               folder=path,
                               name=fileName,
                               recurse=recurse,
                               name_mode=mode,
                               project=projId,
                               return_handler=False))

    if not fileDicts:
        #print "- Found 0 files from '" + proj + ":" + filePath + "'."
        if verbose:
            print("ERROR: Failed to find '" + proj + ":" + filePath + "'.")
        return None
    elif len(fileDicts) > 1 or multiple:
        #print "- Found "+str(len(fileDict))+" files from '" + proj + ":" + filePath + "'."
        if not multiple:
            if verbose:
                print("ERROR: Found " + str(
                    len(fileDicts)
                ) + " files when expecting 1 '" + proj + ":" + filePath + "'.")
            return None
        else:
            if verbose:
                print(" Found " + str(len(
                    fileDicts)) + " files for '" + proj + ":" + filePath + "'.")
            # Cache every match and return all of their IDs.
            fids = []
            for fileDict in fileDicts:
                FILES[fileDict['id']] = dxpy.dxlink(fileDict)
                fids.append(fileDict['id'])
            return fids
    else:
        #print "- FOUND '" + proj + ":" + filePath + "'."
        FILES[fileDicts[0]['id']] = dxpy.dxlink(fileDicts[0])
        return fileDicts[0]['id']