def make_indexed_reference(job_inputs):
    logging.info("Indexing reference genome")

    run_shell("dx-contigset-to-fasta %s reference.fasta" % job_inputs['reference']['$dnanexus_link'])
    ref_details = dxpy.DXRecord(job_inputs['reference']['$dnanexus_link']).get_details()
    ref_name = dxpy.DXRecord(job_inputs['reference']['$dnanexus_link']).describe()['name']

    # TODO: test if the genomes near the boundary work OK
    if sum(ref_details['contigs']['sizes']) < 2*1024*1024*1024:
        subprocess.check_call("bwa index -a is reference.fasta", shell=True)
    else:
        subprocess.check_call("bwa index -a bwtsw reference.fasta", shell=True)

    subprocess.check_call("XZ_OPT=-0 tar -cJf reference.tar.xz reference.fasta*", shell=True)

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz", hidden=True, wait_on_close=True)
    indexed_ref_record = dxpy.new_dxrecord(name=ref_name + " (indexed for BWA)",
                                           types=["BwaLetterContigSetV3"],
                                           details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                                                    'original_contigset': job_inputs['reference']})
    indexed_ref_record.close()

    # TODO: dxpy project workspace convenience functions
    # FIXME
    # if "projectWorkspace" in job:
    #     indexed_ref_record.clone(job["projectWorkspace"])

    return indexed_ref_record
def make_indexed_reference(ref_ID):
    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")

    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz", hidden=True, wait_on_close=True)
    indexed_ref_record = dxpy.new_dxrecord(name=ref_name + " (indexed for Bowtie2)",
                                           types=["BowtieLetterContigSetV2"],
                                           details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                                                    'original_contigset': dxpy.dxlink(ref_ID)})
    indexed_ref_record.close()

    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
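# Both indexing routines above call a run_shell() helper that is not shown in
# this section. A minimal sketch of what such a helper could look like,
# assuming it only logs the command and delegates to the shell (the exact
# behavior in the original code base may differ):
def run_shell(cmd):
    import logging
    import subprocess
    # Log the command, then run it through the shell; raise on non-zero exit,
    # mirroring the subprocess.check_call(..., shell=True) calls used above.
    logging.info("Running: %s", cmd)
    subprocess.check_call(cmd, shell=True)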
def __init__(self, record_link, fastqs=None):
    self.record_link = record_link.strip()
    link_elements = self.record_link.split(':')
    record_project = link_elements[0]
    record_dxid = link_elements[1]
    self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)

    # Get record details
    self.details = self.record.get_details()
    self.project_id = self.details['laneProject']
    self.mapping_reference = self.details['mappingReference']
    self.lane_index = int(self.details['lane'])
    self.run_name = self.details['run']
    self.run_date = self.run_name.split('_')[0]
    self.library_id = self.details['library_id']
    self.lane_id = self.details['lane_id']

    # Parse library name ("DL_set2_rep1 rcvd 1/4/16")
    library_label = self.details['library']
    elements = library_label.split('rcvd')
    library_name = elements[0].rstrip()
    self.library_name = re.sub(r"[^a-zA-Z0-9]+", "-", library_name)

    # Get record properties
    self.properties = self.record.get_properties()
    self.mapper = self.properties['mapper']
    self.reference_genome_dxid = self.properties['reference_genome_dxid']
    self.reference_index_dxid = self.properties['reference_index_dxid']
    self.flowcell_id = self.properties['flowcell_id']

    self.fastq_dxids = fastqs
def __init__(self, record_link):
    self.record_link = record_link.strip()
    link_elements = self.record_link.split(':')
    self.record_project = link_elements[0]
    self.record_dxid = link_elements[1]
    self.record = dxpy.DXRecord(dxid=self.record_dxid, project=self.record_project)

    # Get relevant dashboard details
    self.details = self.record.get_details()
    self.run_name = self.details['run']
    self.lane_index = self.details['lane']
    self.library_name = self.details['library']
    self.project_id = self.details['laneProject']

    # Get relevant dashboard properties
    self.properties = self.record.get_properties()
    self.flowcell_id = self.properties['flowcell_id']
    self.lab = self.properties['lab']
    self.operator = 'None'  # Still need to grab this info

    # Get mapping info for mapped lanes
    self.mapper = self.properties['mapper']
    if self.mapper == 'None':
        self.mapper = None
        self.ref_genome_dxid = None
        self.reference_genome = None
    else:
        self.ref_genome_dxid = self.properties['reference_genome_dxid']
        self.reference_genome = self.details['mappingReference']
def main(contig_set):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    contig_set = dxpy.DXRecord(contig_set)

    # The following line extracts the name from the file object so that
    # outputs can be named intelligently. It is not automatically generated by
    # the app wizard.
    name = contig_set.describe()['name'].replace(".fa", "")

    # Fill in your application code here.
    subprocess.check_call("dx-contigset-to-fasta %s %s.fa" % (contig_set.get_id(), name), shell=True)
    subprocess.check_call("gzip %s.fa" % name, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    fasta_gz = dxpy.upload_local_file("%s.fa.gz" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["fasta_gz"] = dxpy.dxlink(fasta_gz)

    return output
def __init__(self, record_link):
    self.record_link = record_link.strip()
    link_elements = self.record_link.split(':')
    record_project = link_elements[0]
    record_dxid = link_elements[1]
    self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)
    self.properties = self.record.get_properties()
    self.details = self.record.get_details()

    # Details (Used for Dashboard information)
    self.lane_project_id = self.details['laneProject']
    self.run_name = self.details['run']
    self.run_date = self.run_name.split('_')[0]
    self.lane_index = int(self.details['lane'])
    self.library_id = self.details['library_id']
    self.lane_id = self.details['lane_id']

    # Parse library name ("DL_set2_rep1 rcvd 1/4/16")
    library_label = self.details['library']
    elements = library_label.split('rcvd')
    library_name = elements[0].rstrip()
    self.library_name = re.sub(r"[^a-zA-Z0-9]+", "-", library_name)

    # Properties
    self.lims_url = self.properties['lims_url']
    self.lims_token = self.properties['lims_token']
    self.rta_version = self.properties['rta_version']
    self.seq_instrument = self.properties['seq_instrument']
    self.flowcell_id = self.properties['flowcell_id']

    self.lane_project = dxpy.DXProject(dxid=self.lane_project_id)
    self.home = os.getcwd()

    self.sample_sheet = None
    self.output_dir = None
    self.bcl2fastq_version = None
    self.lane_barcode = None

    # Choose bcl2fastq version based on rta_version
    ## DEV: Update version to match official documentation: i.e. 1.18.54 or later
    if StrictVersion(self.rta_version) < StrictVersion('1.18.54'):
        self.bcl2fastq_version = 1
    elif StrictVersion(self.rta_version) >= StrictVersion('1.18.54'):
        self.bcl2fastq_version = 2

    # Get barcode information (codepoint + name) from LIMS
    # Used to add barcode name to FastQ files
    self.connection = Connection(lims_url=self.lims_url, lims_token=self.lims_token)
    self.run_info = RunInfo(conn=self.connection, run=self.run_name)
    self.lane_info = self.run_info.get_lane(self.lane_index)
    self.barcode_dict = {}
    barcode_list = self.lane_info['barcodes']
    for barcode_info in barcode_list:
        self.barcode_dict[barcode_info['codepoint']] = barcode_info['name']
def __init__(self, project_dxid, record_link, dx_user_id, user_first_name, user_last_name,
             user_email, viewers, release_note, lims_url, lims_token):
    # This is lane level stuff. Most of this info will be stored in dxrecord.
    if record_link:
        self.record_link = record_link.strip()
        link_elements = self.record_link.split(':')
        record_project = link_elements[0]
        record_dxid = link_elements[1]
        self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)
    else:
        self.record = None

    self.project_dxid = project_dxid
    self.dx_user_id = dx_user_id
    self.user_first_name = user_first_name
    self.user_last_name = user_last_name
    self.user_email = user_email
    self.viewers = viewers
    self.release_note = release_note
    self.lims_url = lims_url
    self.lims_token = lims_token

    # Values assigned during project transfer
    self.sponsored_datetime = None
    self.release_project_dxid = None
    self.clone_project_dxid = None

    # Values gotten from DXRecord
    self.properties = None
    self.details = None
    self.lane_index = None
    self.run_name = None
    self.library_name = None
    self.production = None
    self.lab = None

    if self.record:
        self.properties = self.record.get_properties()
        self.details = self.record.get_details()
        self.parse_record_details()
        self.parse_record_properties()
def render_bundleddepends(thing):
    from ..bindings.search import find_one_data_object
    from ..exceptions import DXError
    bundles = []
    for item in thing:
        bundle_asset_record = dxpy.DXFile(item["id"]["$dnanexus_link"]).get_properties().get("AssetBundle")
        asset = None
        if bundle_asset_record:
            asset = dxpy.DXRecord(bundle_asset_record)
        if asset:
            try:
                bundles.append(asset.describe().get("name") + " (" + asset.get_id() + ")")
            except DXError:
                asset = None
        if not asset:
            bundles.append(item["name"] + " (" + item["id"]["$dnanexus_link"] + ")")
    return bundles
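# For context, each entry passed to render_bundleddepends() is expected to have
# the same shape as the bundledDepends entries seen elsewhere in this section.
# A hypothetical call (the IDs and names below are made up, not real objects):
#
#   deps = [{"name": "resources.tar.gz",
#            "id": {"$dnanexus_link": "file-xxxx"}}]
#   print(render_bundleddepends(deps))
#
# If the file carries an "AssetBundle" property pointing at a record, the
# rendered string uses the asset record's name and ID; otherwise it falls back
# to the dependency's own name and file ID, e.g. "resources.tar.gz (file-xxxx)".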
def __init__(self, record_link):
    self.record_link = record_link.strip()
    link_elements = self.record_link.split(':')
    record_project = link_elements[0]
    record_dxid = link_elements[1]
    self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)

    # Get relevant dashboard details
    self.details = self.record.get_details()
    self.run_name = self.details['run']
    self.lane_index = self.details['lane']
    self.library_name = self.details['library']
    self.project_dxid = self.details['laneProject']

    # Get relevant dashboard properties
    self.properties = self.record.get_properties()
    self.flowcell_id = self.properties['flowcell_id']
    self.lab = self.properties['lab']
    self.operator = 'None'  # Still need to grab this info

    # Boolean indicating whether project is part of production pipeline
    self.is_production = None
    production = self.properties['production']
    if production == 'true':
        self.is_production = True
    else:
        self.is_production = False

    # Get mapping info for mapped lanes
    self.mapper = self.properties['mapper']
    if self.mapper == 'None':
        self.mapper = None
        self.ref_genome_dxid = None
        self.reference_genome = None
    else:
        self.ref_genome_dxid = self.properties['reference_genome_dxid']
        self.reference_genome = self.details['mappingReference']
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(kwargs['path'], expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(str(details)) + '\n')
    if entity_result is None:
        parser.exit(1, fill('Could not resolve ' + kwargs['path'] + ' to a data object') + '\n')

    filename = kwargs['output']
    if filename is None:
        filename = entity_result['describe']['name'].replace('/', '%2F') + ".vcf"
    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        outputFile = open(filename, 'w')

    exportRef = kwargs['export_ref_calls']
    exportNoCall = kwargs['export_no_calls']

    variantsTable = dxpy.open_dxgtable(entity_result['id'])

    try:
        originalContigSet = variantsTable.get_details()['original_contigset']
    except:
        raise dxpy.AppError("The original reference genome must be attached as a detail")
    contigDetails = dxpy.DXRecord(originalContigSet).get_details()

    if kwargs['reference'] is not None:
        refFileName = kwargs['reference']
        if not os.path.isfile(refFileName):
            raise dxpy.AppError("The reference expected by the variants to vcf script was not a valid file")
    else:
        refFileName = tempfile.NamedTemporaryFile(prefix='reference_', suffix='.txt', delete=False).name
        dxpy.download_dxfile(contigDetails['flat_sequence_file']['$dnanexus_link'], refFileName)

    if kwargs['write_header']:
        infos = variantsTable.get_details().get('infos')
        formats = variantsTable.get_details().get('formats')
        alts = variantsTable.get_details().get('alts')
        filters = variantsTable.get_details().get('filters')
        samples = variantsTable.get_details().get('samples')

        outputFile.write("##fileformat=VCFv4.1\n")

        if infos is not None:
            for k, v in collections.OrderedDict(sorted(infos.iteritems())).iteritems():
                outputFile.write("##INFO=<ID=" + k + ",Number=" + v['number'] + ",Type=" + v['type'] + ",Description=\"" + v['description'] + "\">\n")

        if len(samples) > 0:
            outputFile.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
            outputFile.write("##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n")
            outputFile.write("##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n")
        if formats is not None:
            for k, v in collections.OrderedDict(sorted(formats.iteritems())).iteritems():
                outputFile.write("##FORMAT=<ID=" + k + ",Number=" + v['number'] + ",Type=" + v['type'] + ",Description=\"" + v['description'] + "\">\n")
        if alts is not None:
            for k, v in collections.OrderedDict(sorted(alts.iteritems())).iteritems():
                outputFile.write("##ALT=<ID=" + k + ",Description=\"" + v['description'] + "\">\n")
        if filters is not None:
            for k, v in collections.OrderedDict(sorted(filters.iteritems())).iteritems():
                outputFile.write("##FILTER=<ID=" + k + ",Description=\"" + v + "\">\n")
        for i in range(len(contigDetails['contigs']['names'])):
            outputFile.write("##contig=<ID=" + contigDetails['contigs']['names'][i] + ",length=" + str(contigDetails['contigs']['sizes'][i]) + ">\n")
        outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")
        if len(samples) > 0:
            outputFile.write("\tFORMAT")
            for x in samples:
                outputFile.write("\t" + x)
        outputFile.write("\n")

    chromosomeOffsets = {}
    for i in range(len(contigDetails['contigs']['names'])):
        chromosomeOffsets[contigDetails['contigs']['names'][i]] = contigDetails['contigs']['offsets'][i]

    contigSequence = open(refFileName, 'r').read()

    col = {}
    names = variantsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1
    col = collections.OrderedDict(sorted(col.items()))

    chromosomeList = contigDetails['contigs']['names']
    if kwargs['chr'] is not None:
        intersection = []
        for x in chromosomeList:
            if x in kwargs['chr']:
                intersection.append(x)
        chromosomeList = intersection[:]

    for chromosome in chromosomeList:
        buff = []
        lastPosition = -1
        query = variantsTable.genomic_range_query(chr=chromosome, lo=0, hi=sys.maxsize)
        for row in variantsTable.get_rows(query=query, limit=1)['data']:
            startRow = row[0]
            for row in variantsTable.iterate_rows(start=startRow):
                if row[1] != chromosome:
                    break
                if lastPosition < row[col["lo"]]:
                    writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
                    buff = []
                buff.append(row)
                lastPosition = row[col["lo"]]
        writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
        buff = []
def _dump_app_or_applet(executable, omit_resources=False, describe_output={}):
    info = executable.get()

    if info["runSpec"]["interpreter"] == "bash":
        suffix = "sh"
    elif info["runSpec"]["interpreter"] in ["python2.7", "python3", "python3.5"]:
        suffix = "py"
    else:
        print('Sorry, I don\'t know how to get executables with interpreter ' +
              info["runSpec"]["interpreter"] + '\n', file=sys.stderr)
        sys.exit(1)

    # Entry point script
    script = "src/code.%s" % (suffix,)
    os.mkdir("src")
    with open(script, "w") as f:
        f.write(info["runSpec"]["code"])

    def make_cluster_bootstrap_script_file(region, entry_point, code, suffix):
        """
        Writes the string `code` into a file at the relative path
        "src/<region>_<entry_point>_clusterBootstrap.<suffix>"
        """
        script_name = "src/%s_%s_clusterBootstrap.%s" % (region, entry_point, suffix)
        with open(script_name, "w") as f:
            f.write(code)
        return script_name

    # Get all the asset bundles
    asset_depends = []
    deps_to_remove = []

    # When an applet is built bundledDepends are added in the following order:
    # 1. bundledDepends explicitly specified in the dxapp.json
    # 2. resources (contents of resources directory added as bundledDepends)
    # 3. assetDepends (translated into bundledDepends)
    #
    # Therefore while translating bundledDepends to assetDepends, we are traversing the
    # list in reverse order and exiting when we can't find the "AssetBundle" property
    # with the tarball file.
    #
    # NOTE: If the last item (and contiguous earlier items) of bundledDepends (#1 above) refers to an
    # AssetBundle tarball, those items will be converted to assetDepends.
    #
    # TODO: The bundledDepends should be annotated with another field called {"asset": true}
    # to distinguish it from non assets. It will be needed to annotate the bundledDepends,
    # when the wrapper record object is no longer accessible.
    for dep in reversed(info["runSpec"]["bundledDepends"]):
        file_handle = get_handler(dep["id"])
        if isinstance(file_handle, dxpy.DXFile):
            asset_record_id = file_handle.get_properties().get("AssetBundle")
            asset_record = None
            if asset_record_id:
                asset_record = dxpy.DXRecord(asset_record_id)
            if asset_record:
                try:
                    asset_json = {"name": asset_record.describe().get("name"),
                                  "project": asset_record.get_proj_id(),
                                  "folder": asset_record.describe().get("folder"),
                                  "version": asset_record.describe(fields={"properties": True})["properties"]["version"]}
                    if dep.get("stages"):
                        asset_json["stages"] = dep["stages"]
                    asset_depends.append(asset_json)
                    deps_to_remove.append(dep)
                except DXError:
                    print("Describe failed on the assetDepends record object with ID - " +
                          asset_record_id + "\n", file=sys.stderr)
                    pass
            else:
                break

    # Reversing the order of the asset_depends[] so that original order is maintained
    asset_depends.reverse()

    # resources/ directory
    created_resources_directory = False
    if not omit_resources:
        for dep in info["runSpec"]["bundledDepends"]:
            if dep in deps_to_remove:
                continue
            handler = get_handler(dep["id"])
            if isinstance(handler, dxpy.DXFile):
                if not created_resources_directory:
                    os.mkdir("resources")
                    created_resources_directory = True
                handler_id = handler.get_id()
                fname = "resources/%s.tar.gz" % (handler_id)
                download_dxfile(handler_id, fname)
                print("Unpacking resources", file=sys.stderr)

                def untar_strip_leading_slash(tarfname, path):
                    t = tarfile.open(tarfname)
                    for m in t.getmembers():
                        if m.name.startswith("/"):
                            m.name = m.name[1:]
                        t.extract(m, path)
                    t.close()

                untar_strip_leading_slash(fname, "resources")
                os.unlink(fname)
                deps_to_remove.append(dep)

    # TODO: if output directory is not the same as executable name we
    # should print a warning and/or offer to rewrite the "name"
    # field in the 'dxapp.json'
    dxapp_json = collections.OrderedDict()
    all_keys = executable._get_required_keys() + executable._get_optional_keys()
    for key in all_keys:
        if key in executable._get_describe_output_keys() and key in describe_output:
            dxapp_json[key] = describe_output[key]
        if key in info:
            dxapp_json[key] = info[key]
    if info.get("hidden", False):
        dxapp_json["hidden"] = True

    # TODO: inputSpec and outputSpec elements should have their keys
    # printed in a sensible (or at least consistent) order too

    # Un-inline code
    del dxapp_json["runSpec"]["code"]
    dxapp_json["runSpec"]["file"] = script

    # Remove resources from bundledDepends
    for dep in deps_to_remove:
        dxapp_json["runSpec"]["bundledDepends"].remove(dep)

    # Add assetDepends to dxapp.json
    if len(asset_depends) > 0:
        dxapp_json["runSpec"]["assetDepends"] = asset_depends

    # Ordering input/output spec keys
    ordered_spec_keys = ("name", "label", "help", "class", "type", "patterns", "optional",
                         "default", "choices", "suggestions", "group")
    for spec_key in "inputSpec", "outputSpec":
        if spec_key not in dxapp_json.keys():
            continue
        for i, spec in enumerate(dxapp_json[spec_key]):
            ordered_spec = collections.OrderedDict()
            # Adding keys, for which the ordering is defined
            for key in ordered_spec_keys:
                if key in spec.keys():
                    ordered_spec[key] = spec[key]
            # Adding the rest of the keys
            for key in spec.keys():
                if key not in ordered_spec_keys:
                    ordered_spec[key] = spec[key]
            dxapp_json[spec_key][i] = ordered_spec

    # Remove dx-toolkit from execDepends
    dx_toolkit = {"name": "dx-toolkit", "package_manager": "apt"}
    if dx_toolkit in dxapp_json["runSpec"].get("execDepends", ()):
        dxapp_json["runSpec"]["execDepends"].remove(dx_toolkit)

    # Remove "bundledDependsByRegion" field from "runSpec". This utility
    # will reconstruct the resources directory based on the
    # "bundledDepends" field, which should be equivalent to
    # "bundledDependsByRegion".
    dxapp_json["runSpec"].pop("bundledDependsByRegion", None)

    # "dx build" parses the "regionalOptions" key from dxapp.json into the
    # "runSpec.systemRequirements" field of applet/new.
    # "dx get" should parse the "systemRequirementsByRegion" field from
    # the response of /app-x/get or /applet-x/get into the "regionalOptions"
    # key in dxapp.json.
    if "systemRequirementsByRegion" in dxapp_json['runSpec']:
        dxapp_json["regionalOptions"] = {}
        for region in dxapp_json['runSpec']["systemRequirementsByRegion"]:
            region_sys_reqs = dxapp_json['runSpec']['systemRequirementsByRegion'][region]

            # handle cluster bootstrap scripts if any are present
            for entry_point in region_sys_reqs:
                try:
                    bootstrap_script = region_sys_reqs[entry_point]['clusterSpec']['bootstrapScript']
                    filename = make_cluster_bootstrap_script_file(region, entry_point, bootstrap_script, suffix)
                    region_sys_reqs[entry_point]['clusterSpec']['bootstrapScript'] = filename
                except KeyError:
                    # either no "clusterSpec" or no "bootstrapScript" within "clusterSpec"
                    continue

            dxapp_json["regionalOptions"][region] = dict(systemRequirements=region_sys_reqs)

    # systemRequirementsByRegion data is stored in regionalOptions,
    # systemRequirements is ignored
    if 'systemRequirementsByRegion' in dxapp_json["runSpec"]:
        del dxapp_json["runSpec"]["systemRequirementsByRegion"]
    if 'systemRequirements' in dxapp_json["runSpec"]:
        del dxapp_json["runSpec"]["systemRequirements"]

    # Cleanup of empty elements. Be careful not to let this step
    # introduce any semantic changes to the app specification. For
    # example, an empty input (output) spec is not equivalent to a
    # missing input (output) spec.
    if 'runSpec' in dxapp_json:
        _recursive_cleanup(dxapp_json['runSpec'])
    if 'access' in dxapp_json:
        _recursive_cleanup(dxapp_json['access'])
    for key in executable._get_cleanup_keys():
        if key in dxapp_json and not dxapp_json[key]:
            del dxapp_json[key]

    readme = info.get("description", "")
    devnotes = info.get("developerNotes", "")

    # Write dxapp.json, Readme.md, and Readme.developer.md
    _write_json_file("dxapp.json", dxapp_json)
    if readme:
        _write_simple_file("Readme.md", readme)
    if devnotes:
        _write_simple_file("Readme.developer.md", devnotes)
def main():
    argparser = argparse.ArgumentParser(description="Build a dxCompiler release")
    argparser.add_argument("--force",
                           help="Build even if there is an existing version",
                           action='store_true',
                           default=False)
    argparser.add_argument("--multi-region",
                           help="Copy to all supported regions",
                           action='store_true',
                           default=False)
    argparser.add_argument("--dry-run",
                           help="Don't build any artifacts",
                           action='store_true',
                           default=False)
    args = argparser.parse_args()

    # build multi-region jar for releases, or
    # if explicitly specified
    multi_region = args.multi_region

    # Choose which dictionary to use
    if multi_region:
        project_dict = RELEASE_DICT
    else:
        project_dict = TEST_DICT

    project = util.get_project(project_dict[HOME_REGION])
    print("project: {} ({})".format(project.name, project.get_id()))

    # Figure out what the current version is
    version_id = util.get_version_id(top_dir)
    print("version: {}".format(version_id))

    # Set the folder
    folder = "/releases/{}".format(version_id)
    print("folder: {}".format(folder))

    if args.dry_run:
        args.force = False

    # remove the existing directory paths
    if args.force:
        for proj_name in project_dict.values():
            print("removing path {}:{}".format(proj_name, folder))
            dx_proj = util.get_project(proj_name)
            try:
                dx_proj.remove_folder(folder, recurse=True)
            except dxpy.DXError:
                pass

    # Make sure the target directory exists
    project.new_folder(folder, parents=True)

    # Build the asset, and the compiler jar file.
    path_dict = dict(map(lambda kv: (kv[0], kv[1] + ":" + folder), project_dict.items()))

    if args.dry_run:
        return

    home_ad = util.build(project, folder, version_id, top_dir, path_dict)

    if multi_region:
        for lang, asset_desc in home_ad.asset_ids.items():
            home_rec = dxpy.DXRecord(asset_desc)
            all_regions = project_dict.keys()
            # Leave only regions where the asset is missing
            target_regions = []
            for dest_region in all_regions:
                dest_proj = util.get_project(project_dict[dest_region])
                dest_asset = util.find_asset(dest_proj, folder, lang)
                if dest_asset == None:
                    target_regions.append(dest_region)
            _clone_asset(home_rec, folder, target_regions, project_dict)
def dump_executable(executable, destination_directory, omit_resources=False, describe_output=[]):
    """
    Reconstitutes executable into a directory that would create a
    functionally identical executable if "dx build" were run on it.
    destination_directory will be the root source directory for the applet.

    :param executable: executable, i.e. app or applet, to be dumped
    :type executable: DXExecutable (only DXApp or DXApplet now)
    :param destination_directory: an existing, empty, and writable directory
    :type destination_directory: str
    """
    old_cwd = os.getcwd()
    os.chdir(destination_directory)
    try:
        info = executable.get()

        if info["runSpec"]["interpreter"] == "bash":
            suffix = "sh"
        elif info["runSpec"]["interpreter"] == "python2.7":
            suffix = "py"
        else:
            print('Sorry, I don\'t know how to get executables with interpreter ' +
                  info["runSpec"]["interpreter"] + '\n', file=sys.stderr)
            sys.exit(1)

        # Entry point script
        script = "src/code.%s" % (suffix,)
        os.mkdir("src")
        with open(script, "w") as f:
            f.write(info["runSpec"]["code"])

        # Get all the asset bundles
        asset_depends = []
        deps_to_remove = []

        # When an applet is built bundledDepends are added in the following order:
        # 1. bundledDepends explicitly specified in the dxapp.json
        # 2. resources (contents of resources directory added as bundledDepends)
        # 3. assetDepends (translated into bundledDepends)
        #
        # Therefore while translating bundledDepends to assetDepends, we are traversing the
        # list in reverse order and exiting when we can't find the "AssetBundle" property
        # with the tarball file.
        #
        # NOTE: If the last item (and contiguous earlier items) of bundledDepends (#1 above) refers to an
        # AssetBundle tarball, those items will be converted to assetDepends.
        #
        # TODO: The bundledDepends should be annotated with another field called {"asset": true}
        # to distinguish it from non assets. It will be needed to annotate the bundledDepends,
        # when the wrapper record object is no longer accessible.
        for dep in reversed(info["runSpec"]["bundledDepends"]):
            file_handle = get_handler(dep["id"])
            if isinstance(file_handle, dxpy.DXFile):
                asset_record_id = file_handle.get_properties().get("AssetBundle")
                asset_record = None
                if asset_record_id:
                    asset_record = dxpy.DXRecord(asset_record_id)
                if asset_record:
                    try:
                        asset_depends.append(
                            {"name": asset_record.describe().get("name"),
                             "project": asset_record.get_proj_id(),
                             "folder": asset_record.describe().get("folder"),
                             "version": asset_record.describe(fields={"properties": True})["properties"]["version"]})
                        deps_to_remove.append(dep)
                    except DXError:
                        print("Describe failed on the assetDepends record object with ID - " +
                              asset_record_id + "\n", file=sys.stderr)
                        pass
                else:
                    break

        # Reversing the order of the asset_depends[] so that original order is maintained
        asset_depends.reverse()

        # resources/ directory
        created_resources_directory = False
        if not omit_resources:
            for dep in info["runSpec"]["bundledDepends"]:
                if dep in deps_to_remove:
                    continue
                handler = get_handler(dep["id"])
                if isinstance(handler, dxpy.DXFile):
                    if not created_resources_directory:
                        os.mkdir("resources")
                        created_resources_directory = True
                    handler_id = handler.get_id()
                    fname = "resources/%s.tar.gz" % (handler_id)
                    download_dxfile(handler_id, fname)
                    print("Unpacking resources", file=sys.stderr)
                    tar = tarfile.open(fname)
                    tar.extractall("resources")
                    tar.close()
                    os.unlink(fname)
                    deps_to_remove.append(dep)

        # TODO: if output directory is not the same as executable name we
        # should print a warning and/or offer to rewrite the "name"
        # field in the 'dxapp.json'
        dxapp_json = collections.OrderedDict()
        all_keys = executable._get_required_keys() + executable._get_optional_keys()
        for key in all_keys:
            if key in executable._get_describe_output_keys() and key in describe_output:
                dxapp_json[key] = describe_output[key]
            if key in info:
                dxapp_json[key] = info[key]
        if info.get("hidden", False):
            dxapp_json["hidden"] = True

        # TODO: inputSpec and outputSpec elements should have their keys
        # printed in a sensible (or at least consistent) order too

        # Un-inline code
        del dxapp_json["runSpec"]["code"]
        dxapp_json["runSpec"]["file"] = script

        # Remove resources from bundledDepends
        for dep in deps_to_remove:
            dxapp_json["runSpec"]["bundledDepends"].remove(dep)

        # Add assetDepends to dxapp.json
        if len(asset_depends) > 0:
            dxapp_json["runSpec"]["assetDepends"] = asset_depends

        # Ordering input/output spec keys
        ordered_spec_keys = ("name", "label", "help", "class", "type", "patterns", "optional",
                             "default", "choices", "suggestions", "group")
        for spec_key in "inputSpec", "outputSpec":
            if spec_key not in dxapp_json.keys():
                continue
            for i, spec in enumerate(dxapp_json[spec_key]):
                ordered_spec = collections.OrderedDict()
                # Adding keys, for which the ordering is defined
                for key in ordered_spec_keys:
                    if key in spec.keys():
                        ordered_spec[key] = spec[key]
                # Adding the rest of the keys
                for key in spec.keys():
                    if key not in ordered_spec_keys:
                        ordered_spec[key] = spec[key]
                dxapp_json[spec_key][i] = ordered_spec

        # Remove dx-toolkit from execDepends
        dx_toolkit = {"name": "dx-toolkit", "package_manager": "apt"}
        if dx_toolkit in dxapp_json["runSpec"]["execDepends"]:
            dxapp_json["runSpec"]["execDepends"].remove(dx_toolkit)

        # Remove "bundledDependsByRegion" field from "runSpec". This utility
        # will reconstruct the resources directory based on the
        # "bundledDepends" field, which should be equivalent to
        # "bundledDependsByRegion".
        dxapp_json["runSpec"].pop("bundledDependsByRegion", None)

        # Cleanup of empty elements. Be careful not to let this step
        # introduce any semantic changes to the app specification. For
        # example, an empty input (output) spec is not equivalent to a
        # missing input (output) spec.
        if 'runSpec' in dxapp_json:
            _recursive_cleanup(dxapp_json['runSpec'])
        if 'access' in dxapp_json:
            _recursive_cleanup(dxapp_json['access'])
        for key in executable._get_cleanup_keys():
            if key in dxapp_json and not dxapp_json[key]:
                del dxapp_json[key]

        readme = info.get("description", "")
        devnotes = info.get("developerNotes", "")

        # Write dxapp.json, Readme.md, and Readme.developer.md
        with open("dxapp.json", "w") as f:
            f.write(flatten_json_array(json.dumps(dxapp_json, indent=2, separators=(',', ': ')), "patterns"))
            f.write('\n')
        if readme:
            with open("Readme.md", "w") as f:
                f.write(readme)
        if devnotes:
            with open("Readme.developer.md", "w") as f:
                f.write(devnotes)
    except:
        err_exit()
    finally:
        os.chdir(old_cwd)
def main(**job_inputs):
    output = {}
    reportInput = {}

    run_shell("dx-spans-to-bed --output genes.bed " + job_inputs["gene_model"]["$dnanexus_link"])
    bed_id = dxpy.upload_local_file("genes.bed").get_id()

    mappings_id = job_inputs["mappings"]["$dnanexus_link"]

    # get contaminant mapping started if we're doing it:
    if "contaminants" in job_inputs:
        if not "original_reads" in job_inputs:
            raise dxpy.AppError("Original Reads must be input to calculate contamination levels. Please also supply the reads object that corresponds to these RNA-Seq mappings")

        name_input = []
        contam_input = []
        # spawn mappings job for each ContigSet
        for contaminant in job_inputs['contaminants']:
            calc_job = map_contaminant(Reads=job_inputs['original_reads'], Contig=contaminant)
            name_input.append(dxpy.DXRecord(contaminant).describe()['name'])
            contam_input.append({"job": calc_job, "field": "percent_mapped"})

        reportInput['contam'] = contam_input
        reportInput['names'] = name_input
    else:
        reportInput['contam'] = None
        reportInput['names'] = None

    # output mappings as SAM for analysis modules
    run_shell(" ".join(["dx-mappings-to-sam", "--discard_unmapped", "--output mappings.sam", mappings_id]))
    run_shell(" ".join(["samtools", "view", "-S", "-b", "mappings.sam", ">", "mappings.bam"]))

    bam_id = dxpy.upload_local_file("mappings.bam", wait_on_close=True).get_id()

    job1 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "geneBody_coverage")

    # if paired then do inner distance calculation
    if "chr2" in dxpy.DXGTable(mappings_id).get_col_names():
        job2 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "inner_distance")
    else:
        job2 = None

    job3 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "junction_annotation")
    job4 = dxpy.new_dxjob({"BAM_file": dxpy.dxlink(bam_id)}, "read_duplication")

    # implement this one when we can request a large RAM instance - requires 19GB for human genome
    job5 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "read_distribution")
    # {"systemRequirements": {"instanceType": "dx_m2.2xlarge"}} )

    reportInput['geneBody'] = {"job": job1.get_id(), "field": "results"}
    if job2 != None:
        reportInput['inner_dist'] = {"job": job2.get_id(), "field": "results"}
    else:
        reportInput['inner_dist'] = None
    reportInput['junc_ann'] = {"job": job3.get_id(), "field": "results"}
    reportInput['read_dup'] = {"job": job4.get_id(), "field": "results"}
    reportInput['read_dist'] = {"job": job5.get_id(), "field": "results"}
    reportInput['mappings'] = job_inputs["mappings"]

    reportJob = dxpy.new_dxjob(reportInput, "generate_report")

    output['report'] = {"job": reportJob.get_id(), "field": "Report"}

    return output
def main():
    '''Get lane, base, and read stats for the given month.
    '''

    args = parse_args(sys.argv[1:])
    outfile = args.outfile

    if args.cron:
        now = datetime.datetime.now()
        year = now.year
        month = now.month - 1
        print 'Info: Collecting metrics for %d-%d' % (year, month)
    else:
        year = args.year
        month = args.month

    monthly_outfile = '%d-%d_seq-stats.txt' % (year, month)

    monthly_metrics = defaultdict()
    '''
    monthly_metrics = {
                       'lane_count': 0,
                       'base_count': 0,
                       'read_count': 0
                      }
    '''

    # Dev: NEED TO QC THIS
    after_date = '%d-%d-01' % (year, month)
    if month >= 1 and month < 12:
        before_date = '%d-%d-01' % (year, int(month + 1))
    elif month == 12:
        before_date = '%d-1-01' % int(year + 1)
    else:
        print 'Error: invalid month %d' % month
        sys.exit()
    print 'After: %s' % after_date
    print 'Before: %s ' % before_date

    monthly_records = dxpy.find_data_objects(classname='record',
                                             project='project-BY82j6Q0jJxgg986V16FQzjx',
                                             folder='/',
                                             typename='SCGPMRun',
                                             created_after=after_date,
                                             created_before=before_date)
    MOUT = open(monthly_outfile, 'w')
    for record in monthly_records:
        sequencer_type = None
        print record['id']
        dxrecord = dxpy.DXRecord(record['id'], record['project'])
        lane_details = dxrecord.get_details()
        lane_properties = dxrecord.get_properties()

        try:
            production = lane_properties['production']
            if production == 'false':
                continue
        except:
            print 'Skipping: not production'
            continue

        dxproject = lane_details['laneProject']
        lane_index = int(lane_details['lane'])
        run_name = str(lane_details['run'])
        paired_end = bool(lane_properties['paired_end'])
        #try:
        #    sequencer_type = str(lane_properties['sequencer_type'])
        #except:
        seq_instrument = str(lane_properties['seq_instrument'])
        sequencer_type = classify_instrument(seq_instrument)
        #sequencer_type = sequencer_type.replace(' ', '_')

        # Create monthly_metrics entry to store cumulative stats
        if not sequencer_type in monthly_metrics.keys():
            monthly_metrics[sequencer_type] = {'lane_count': 0,
                                               'base_count': 0,
                                               'read_count': 0}

        lane_name = '%s_L%d' % (run_name, lane_index)
        print 'Processing %s' % lane_name

        try:
            html_file = dxpy.find_one_data_object(classname='file',
                                                  name='*.lane.html',
                                                  name_mode='glob',
                                                  project=dxproject,
                                                  folder='/stage0_bcl2fastq/miscellany',
                                                  more_ok=True,
                                                  zero_ok=False)
        except:
            print 'Warning: Could not get lane.html file. Skipping'
            continue

        html_dxfile = dxpy.DXFile(html_file['id'], html_file['project'])
        lane_metrics = parse_lane_html(html_dxfile, lane_index, lane_name)

        pf_clusters = int(lane_metrics['pf_clusters'].replace(',', ''))
        if paired_end:
            read_count = pf_clusters * 2
        else:
            read_count = pf_clusters
        mbase_count = int(lane_metrics['yield_mbases'].replace(',', ''))
        base_count = mbase_count * 1000000

        monthly_metrics[sequencer_type]['lane_count'] += 1
        monthly_metrics[sequencer_type]['base_count'] += base_count
        monthly_metrics[sequencer_type]['read_count'] += read_count

        # Write individual record data to the monthly out file.
        # One record/lane per line.
        #pdb.set_trace()
        mout_str = ('{}\t'.format(year) +
                    '{}\t'.format(month) +
                    '{}\t'.format(run_name) +
                    '{}\t'.format(lane_index) +
                    '{}\t'.format(read_count) +
                    '{}\t'.format(base_count) +
                    '{}\n'.format(sequencer_type))
        MOUT.write(mout_str)
    MOUT.close()

    # Add header to new outfile
    if not os.path.isfile(outfile):
        with open(outfile, 'w') as OUT:
            OUT.write('Year\tMonth\tLane_Count\tRead_Count\tBase_Count\tSeq_Type\n')

    # Write monthly metrics to outfile
    with open(outfile, 'a') as OUT:
        '''Old string formatting method
        out_str = '%d\t%d\t%d\t%d\t%d\n' % (year,
                                            month,
                                            monthly_metrics['lane_count'],
                                            monthly_metrics['read_count'],
                                            monthly_metrics['base_count'],
                                            sequencer_type)
        '''
        #pdb.set_trace()
        for sequencer_type in monthly_metrics.keys():
            out_str = ('{}\t'.format(year) +
                       '{}\t'.format(month) +
                       '{}\t'.format(monthly_metrics[sequencer_type]['lane_count']) +
                       '{}\t'.format(monthly_metrics[sequencer_type]['read_count']) +
                       '{}\t'.format(monthly_metrics[sequencer_type]['base_count']) +
                       '{}\n'.format(sequencer_type))
            OUT.write(out_str)

    out_prefix = outfile.split('.')[0]
def main(**kwargs):
    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.mappings_id == None:
        parser.print_help()
        sys.exit(1)

    mappingsTable = dxpy.DXGTable(opts.mappings_id)
    idAsName = opts.id_as_name
    idPrepend = opts.id_prepend
    writeRowId = opts.write_row_id
    paired = "chr2" in mappingsTable.get_col_names()

    regions = []
    if opts.region_file != "":
        regions = re.findall("-L ([^:]*):(\d+)-(\d+)", open(opts.region_file, 'r').read())

    name = mappingsTable.describe()['name']

    if opts.reference != None:
        originalContig = opts.reference
    else:
        try:
            originalContig = mappingsTable.get_details()['original_contigset']['$dnanexus_link']
        except:
            raise dxpy.AppError("The original reference genome must be attached to mappings table")

    try:
        contigDetails = dxpy.DXRecord(originalContig).get_details()['contigs']
    except:
        raise dxpy.AppError("Unable to access reference with ID " + originalContig)

    contigNames = contigDetails['names']
    contigSizes = contigDetails['sizes']

    if opts.file_name != None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    header = ""
    for i in range(len(contigNames)):
        header += "@SQ\tSN:" + str(contigNames[i]) + "\tLN:" + str(contigSizes[i]) + "\n"

    assignReadGroup = opts.assign_read_group
    if assignReadGroup != "":
        header += "@RG\tID:" + assignReadGroup + "\tSM:Sample_0"
    else:
        for i in range(len(mappingsTable.get_details()['read_groups'])):
            header += "@RG\tID:" + str(i) + "\tSM:Sample_" + str(i)
            if opts.read_group_platform != '':
                header += "\tPL:" + opts.read_group_platform
            header += "\n"

    if outputFile != None:
        outputFile.write(header)
    else:
        sys.stdout.write(header)

    col = {}
    names = mappingsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1

    column_descs = mappingsTable.describe()['columns']

    sam_cols = []
    sam_col_names = []
    sam_col_types = {}
    for c in column_descs:
        if c['name'].startswith("sam_field_") or c['name'] == "sam_optional_fields":
            sam_cols.append(c)
            sam_col_names.append(c['name'])
            sam_col_types[c['name']] = c['type']

    defaultCol = {"sequence": "", "name": "", "quality": "", "status": "UNMAPPED",
                  "chr": "", "lo": 0, "hi": 0, "negative_strand": False,
                  "error_probability": 0, "qc_fail": False, "duplicate": False,
                  "cigar": "", "mate_id": -1, "status2": "", "chr2": "", "lo2": 0,
                  "hi2": 0, "negative_strand2": False, "proper_pair": False,
                  "read_group": 0}

    #unmappedFile = open("unmapped.txt", 'w')

    if len(regions) == 0:
        if opts.start_row > mappingsTable.describe()['length']:
            raise dxpy.AppError("Starting row is larger than number of rows in table")
        elif opts.end_row < opts.start_row:
            raise dxpy.AppError("Ending row is before Start")

        if opts.end_row > 0:
            generator = mappingsTable.iterate_rows(start=opts.start_row, end=opts.end_row, want_dict=True)
        else:
            generator = mappingsTable.iterate_rows(start=opts.start_row, want_dict=True)

        # write each row unless we're throwing out unmapped
        for row in generator:
            if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                if not paired:
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                             assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
                elif opts.no_interchromosomal and row["chr"] == row["chr2"]:
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                             assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
                elif opts.only_interchromosomal and opts.no_interchromosomal == False and (row["chr"] != row["chr2"] or (row["chr"] == "" and row["chr2"] == "")):
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                             assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
                elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                             assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
    else:
        for x in regions:
            # generate the query for this region
            query = mappingsTable.genomic_range_query(x[0],
                                                      int(x[1]) + opts.region_index_offset,
                                                      int(x[2]) + opts.region_index_offset,
                                                      index='gri')
            for row in mappingsTable.get_rows(query=query, limit=1)['data']:
                startRow = row[0]
                for row in mappingsTable.iterate_rows(start=startRow, want_dict=True):
                    if row["chr"] != x[0] or row["lo"] > int(x[2]) + opts.region_index_offset:
                        break
                    if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                        if not paired:
                            writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
                        elif opts.no_interchromosomal and row["chr"] == row["chr2"]:
                            writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
                        elif opts.only_interchromosomal and opts.no_interchromosomal == False and (row["chr"] != row["chr2"] or (row["chr"] == "" and row["chr2"] == "")):
                            writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)
                        elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                            writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols, sam_col_names, sam_col_types)

    if outputFile != None:
        outputFile.close()
def main():
    argparser = argparse.ArgumentParser(description="Build a dxWDL release")
    argparser.add_argument("--force",
                           help="Build even if there is an existing version",
                           action='store_true',
                           default=False)
    argparser.add_argument("--multi-region",
                           help="Copy to all supported regions",
                           action='store_true',
                           default=False)
    args = argparser.parse_args()

    # build multi-region jar for releases, or
    # if explicitly specified
    multi_region = args.multi_region

    # Choose which dictionary to use
    if multi_region:
        project_dict = RELEASE_DICT
    else:
        project_dict = TEST_DICT

    project = util.get_project(project_dict[HOME_REGION])
    print("project: {} ({})".format(project.name, project.get_id()))

    # Figure out what the current version is
    version_id = util.get_version_id(top_dir)
    print("version: {}".format(version_id))

    # Set the folder
    folder = "/releases/{}".format(version_id)
    print("folder: {}".format(folder))

    # remove the existing directory paths
    if args.force:
        for proj_name in project_dict.values():
            print("removing path {}:{}".format(proj_name, folder))
            dx_proj = util.get_project(proj_name)
            try:
                dx_proj.remove_folder(folder, recurse=True)
            except dxpy.DXError:
                pass

    # Make sure the target directory exists
    project.new_folder(folder, parents=True)

    # Build the asset, and the compiler jar file.
    path_dict = dict(map(lambda kv: (kv[0], kv[1] + ":" + folder), project_dict.items()))
    (jar_path, home_ad) = util.build(project, folder, version_id, top_dir, path_dict)

    if multi_region:
        # download dxWDL runtime library
        home_rec = dxpy.DXRecord(home_ad.asset_id)
        fid = home_rec.get_details()['archiveFileId']['$dnanexus_link']
        fn = dxpy.describe(fid)['name']
        rtlib_path = "/tmp/{}".format(fn)
        print("Download asset file {}".format(fn))
        dxpy.download_dxfile(fid, rtlib_path, show_progress=True)

        # copy to all other regions
        for region in project_dict.keys():
            if region != home_ad.region:
                proj = project_dict[region]
                if proj is None:
                    raise Exception("No project configured for region {}".format(region))
                dest_proj = util.get_project(proj)
                if dest_proj is not None:
                    dest_ad = util.copy_across_regions(rtlib_path, home_rec, region, dest_proj, folder)
                else:
                    print("No project named {}".format(proj))

    # Upload compiler jar file
    util.upload_local_file(jar_path, project, folder)
def main():
    argparser = argparse.ArgumentParser(description="Build the dxWDL jar file")
    argparser.add_argument("--folder", help="Destination folder")
    argparser.add_argument("--multi-region",
                           help="Copy to all supported regions",
                           action='store_true',
                           default=False)
    argparser.add_argument("--release",
                           help="Create a dxWDL release, implies multi-region",
                           action='store_true',
                           default=False)
    args = argparser.parse_args()

    # resolve project
    project_dict = None
    if args.release:
        project_dict = RELEASE_DICT
    else:
        project_dict = TEST_DICT
    project = util.get_project(project_dict[HOME_REGION])
    print("project: {} ({})".format(project.name, project.get_id()))

    # Set the folder, build one if necessary
    if args.folder is not None:
        folder = args.folder
    elif args.release:
        folder = time.strftime("/releases/%Y-%m-%d/%H%M%S")
        project.new_folder(folder, parents=True)
    else:
        folder = time.strftime("/builds/%Y-%m-%d/%H%M%S")
        project.new_folder(folder, parents=True)
    print("folder: {}".format(folder))

    # build multi-region jar for releases, or
    # if explicitly specified
    multi_region = args.multi_region
    if args.release:
        multi_region = True

    # Figure out what the current version is
    version_id = util.get_version_id(top_dir)
    print("version: {}".format(version_id))

    # build the asset
    home_ad = util.build(project, folder, version_id, top_dir)
    ad_all = [home_ad]

    if multi_region:
        # download dxWDL runtime library
        home_rec = dxpy.DXRecord(home_ad.asset_id)
        fid = home_rec.get_details()['archiveFileId']['$dnanexus_link']
        fn = dxpy.describe(fid)['name']
        rtlib_path = "/tmp/{}".format(fn)
        print("Download asset file {}".format(fn))
        dxpy.download_dxfile(fid, rtlib_path, show_progress=True)

        # copy to all other regions
        for region in project_dict.keys():
            if region != home_ad.region:
                proj = project_dict[region]
                if proj is None:
                    raise Exception("No project configured for region {}".format(region))
                dest_proj = util.get_project(proj)
                dest_ad = util.copy_across_regions(rtlib_path, home_rec, region, dest_proj, folder)
                ad_all.append(dest_ad)

    # build the final jar file, containing a list of the per-region
    # assets
    jar_path = util.build_final_jar(version_id, top_dir, ad_all)

    # Upload compiler jar file
    if args.release:
        util.upload_local_file(jar_path, project, folder)
def clone_asset(record_id, regions, num_retries=0, priority=None):
    """
    This function will attempt to clone the given record into all of the given
    regions. It will return a dictionary with the regions as keys and the
    record-ids of the corresponding asset as the values. If an asset is not
    able to be created in a given region, the value will be set to None.
    """
    # Get the asset record
    record = dxpy.DXRecord(record_id)
    fid = record.get_details()['archiveFileId']['$dnanexus_link']
    curr_region = dxpy.describe(record.project)['region']

    # Only run once per region
    regions = set(regions) - set([curr_region])
    app_supported_regions = set(CLONE_ASSET_APP.describe()['regionalOptions'].keys())
    if len(regions - app_supported_regions) > 0:
        print('Currently no support for the following region(s): [{0}]'.format(', '.join(regions - app_supported_regions)),
              file=sys.stderr)
        sys.exit(1)

    # Get information about the asset
    record_name = record.name
    asset_properties = record.get_properties()
    asset_properties['cloned_from'] = record_id
    asset_file_name = dxpy.describe(fid)['name']
    url = dxpy.DXFile(fid).get_download_url(preauthenticated=True,
                                            project=dxpy.DXFile.NO_PROJECT_HINT,
                                            duration=URL_DURATION)[0]

    # Fire off a clone process for each region
    pool = multiprocessing.Pool(len(regions))
    manager = multiprocessing.Manager()
    q = manager.Queue()
    clone_asset_func = functools.partial(_clone_asset_into_region,
                                         record_name=record_name,
                                         q=q,
                                         asset_properties=asset_properties,
                                         asset_file_name=asset_file_name,
                                         url=url,
                                         num_retries=num_retries,
                                         priority=priority)
    results = pool.map_async(clone_asset_func, regions)

    # Get and return the results
    remaining_regions = regions
    print('Waiting on region(s): {0} '.format(' '.join(remaining_regions)))
    while True:
        if results.ready():
            break
        else:
            if q.qsize() > 0:
                for i in xrange(q.qsize()):
                    received = set([q.get()])
                    remaining_regions = remaining_regions - received
                print('\nWaiting on region(s): {0} '.format(' '.join(remaining_regions)))
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(SLEEP_TIME)
    print('\nDone')

    results = results.get()
    record_ids = {}
    for result in results:
        for region in result:
            if result[region] is None:
                record_ids[region] = None
            else:
                record_ids[region] = result[region]['$dnanexus_link']

    return record_ids
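# A hypothetical invocation of clone_asset(), assuming the surrounding module
# has already defined CLONE_ASSET_APP, URL_DURATION and SLEEP_TIME, and that
# the caller holds a real asset record ID (the ID and region names below are
# placeholders, not real values):
#
#   new_ids = clone_asset("record-xxxx", ["aws:us-east-1", "azure:westus"])
#   for region, rec_id in new_ids.items():
#       # A value of None means the clone into that region did not succeed.
#       print("{}: {}".format(region, rec_id))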
def upload_applet(src_dir, uploaded_resources, check_name_collisions=True, overwrite=False, archive=False,
                  project=None, override_folder=None, override_name=None, dx_toolkit_autodep="stable",
                  dry_run=False, **kwargs):
    """
    Creates a new applet object.

    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to inject if none is present. "stable" for the APT package; "git" for HEAD of dx-toolkit master branch; or False for no dependency.
    :type dx_toolkit_autodep: boolean or string
    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException("Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)" % (src_dir,))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    applets_to_overwrite = []
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + ('/' if not applet_spec['folder'].endswith('/') else '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet", name=applet_spec["name"],
                                             folder=applet_spec['folder'], project=dest_project, recurse=False):
            if overwrite:
                # Don't remove the old applet until after the new one
                # has been created. This avoids a race condition where
                # we remove the old applet, but that causes garbage
                # collection of the bundled resources that will be
                # shared with the new applet
                applets_to_overwrite.append(result['id'])
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'], project=dest_project)
                now = datetime.datetime.fromtimestamp(archived_applet.created / 1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info("Archived applet %s to %s:\"%s/%s\"" % (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException("An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given" % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Carry region-specific values from regionalOptions into the main
    # runSpec
    applet_spec["runSpec"].setdefault("bundledDepends", [])
    applet_spec["runSpec"].setdefault("assetDepends", [])
    if not dry_run:
        region = dxpy.api.project_describe(dest_project, input_params={"fields": {"region": True}})["region"]

        # if regionalOptions contain at least one region, they must include
        # the region of the target project
        if len(applet_spec.get('regionalOptions', {})) != 0 and region not in applet_spec.get('regionalOptions', {}):
            err_mesg = "destination project is in region {} but \"regionalOptions\" do not contain this region. ".format(region)
            err_mesg += "Please, update your \"regionalOptions\" specification"
            raise AppBuilderException(err_mesg)

        regional_options = applet_spec.get('regionalOptions', {}).get(region, {})

        # We checked earlier that if region-specific values for the
        # fields below are given, the same fields are not also specified
        # in the top-level runSpec. So the operations below should not
        # result in any user-supplied settings being clobbered.
        if 'systemRequirements' in regional_options:
            applet_spec["runSpec"]["systemRequirements"] = regional_options['systemRequirements']

        if 'bundledDepends' in regional_options:
            applet_spec["runSpec"]["bundledDepends"].extend(regional_options["bundledDepends"])

        if 'assetDepends' in regional_options:
            applet_spec["runSpec"]["assetDepends"].extend(regional_options["assetDepends"])

    # Inline Readme.md and Readme.developer.md
    dxpy.executable_builder.inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "file" in applet_spec["runSpec"]:
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir, applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # If this applet requires a cluster, inline any bootstrapScript code that may be provided.
    # bootstrapScript is an *optional* clusterSpec parameter.
    # NOTE: assumes bootstrapScript is always provided as a filename
    if "systemRequirements" in applet_spec["runSpec"]:
        sys_reqs = applet_spec["runSpec"]["systemRequirements"]
        for entry_point in sys_reqs:
            try:
                bootstrap_script = os.path.join(src_dir, sys_reqs[entry_point]["clusterSpec"]["bootstrapScript"])
                with open(bootstrap_script) as code_fh:
                    sys_reqs[entry_point]["clusterSpec"]["bootstrapScript"] = code_fh.read()
            except KeyError:
                # either no "clusterSpec" or no "bootstrapScript" within "clusterSpec"
                continue
            except IOError:
                raise AppBuilderException("The clusterSpec \"bootstrapScript\" could not be read.")

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Validate and process assetDepends
    asset_depends = applet_spec["runSpec"]["assetDepends"]
    if type(asset_depends) is not list or any(type(dep) is not dict for dep in asset_depends):
        raise AppBuilderException("Expected runSpec.assetDepends to be an array of objects")
    for asset in asset_depends:
        asset_project = asset.get("project", None)
        asset_folder = asset.get("folder", '/')
        asset_stages = asset.get("stages", None)
        if "id" in asset:
            asset_record = dxpy.DXRecord(asset["id"]).describe(fields={'details'}, default_fields=True)
        elif "name" in asset and asset_project is not None and "version" in asset:
            try:
                asset_record = dxpy.find_one_data_object(zero_ok=True,
                                                         classname="record",
                                                         typename="AssetBundle",
                                                         name=asset["name"],
                                                         properties=dict(version=asset["version"]),
                                                         project=asset_project,
                                                         folder=asset_folder,
                                                         recurse=False,
                                                         describe={"defaultFields": True,
                                                                   "fields": {"details": True}},
                                                         state="closed",
                                                         more_ok=False)
            except dxpy.exceptions.DXSearchError:
                msg = "Found more than one asset record that matches: name={0}, folder={1} in project={2}."
                raise AppBuilderException(msg.format(asset["name"], asset_folder, asset_project))
        else:
            raise AppBuilderException("Each runSpec.assetDepends element must have either {'id'} or "
                                      "{'name', 'project' and 'version'} field(s).")

        if asset_record:
            if "id" in asset:
                asset_details = asset_record["details"]
            else:
                asset_details = asset_record["describe"]["details"]
            if "archiveFileId" in asset_details:
                archive_file_id = asset_details["archiveFileId"]
            else:
                raise AppBuilderException("The required field 'archiveFileId' was not found in "
                                          "the details of the asset bundle %s " % asset_record["id"])
            archive_file_name = dxpy.DXFile(archive_file_id).describe()["name"]
            bundle_depends = {"name": archive_file_name, "id": archive_file_id}
            if asset_stages:
                bundle_depends["stages"] = asset_stages
            applet_spec["runSpec"]["bundledDepends"].append(bundle_depends)
            # If the file is not found in the applet destination project, clone it from the asset project
            if (not dry_run and
                    dxpy.DXRecord(dxid=asset_record["id"], project=dest_project).describe()["project"] != dest_project):
                dxpy.DXRecord(asset_record["id"], project=asset_record["project"]).clone(dest_project)
        else:
            raise AppBuilderException("No asset bundle was found that matched the specification %s"
                                      % (json.dumps(asset)))

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {"name": "dx-toolkit",
                          "package_manager": "git",
                          "url": "git://github.com/dnanexus/dx-toolkit.git",
                          "tag": "master",
                          "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"}
    elif dx_toolkit_autodep == "stable":
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException("dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead" % (dx_toolkit_autodep,))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException("Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(dep.get('name') in DX_TOOLKIT_PKGS or dep.get('url') in DX_TOOLKIT_GIT_URLS
                                   for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    if applet_spec.get("categories", []):
        if "tags" not in applet_spec:
            applet_spec["tags"] = []
        applet_spec["tags"] = list(set(applet_spec["tags"]) | set(applet_spec["categories"]))

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})

    # Now it is permissible to delete the old applet(s), if any
    if applets_to_overwrite:
        logger.info("Deleting applet(s) %s" % (','.join(applets_to_overwrite)))
        dxpy.DXProject(dest_project).remove_objects(applets_to_overwrite)

    return applet_id, applet_spec
def main(**job_inputs):
    print "Beginning processing of RNA data"
    output = {}

    check_reads(job_inputs['reads'])

    # Convert reads tables to FASTQ/FASTA files
    left_reads = []
    right_reads = []

    current_reads = 0
    for reads in job_inputs['reads']:
        print "Converting reads table " + str(reads['$dnanexus_link'])
        left, right = dump_fastqa(reads['$dnanexus_link'], "reads_" + str(current_reads))
        left_reads.append(left)
        if right != None:
            right_reads.append(right)
        current_reads += 1

    # Convert Genes Object to GFF file
    run_shell("dx-genes-to-gtf --output genes.gtf " + job_inputs['gene_model']['$dnanexus_link'])

    # Create or download indexed genome
    genome = dxpy.DXRecord(job_inputs['reference'])
    if not 'indexed_reference' in job_inputs:
        output['indexed_reference'] = dxpy.dxlink(make_indexed_reference(genome.get_id()))
    else:
        output['indexed_reference'] = job_inputs['indexed_reference']
    indexed_genome = dxpy.DXRecord(output['indexed_reference'])
    dxpy.download_dxfile(indexed_genome.get_details()['index_archive'], "reference.tar.xz")
    run_shell("tar -xJf reference.tar.xz")

    # call tophat
    num_cpus = multiprocessing.cpu_count()
    cmd = " ".join(['tophat',
                    "-p", str(num_cpus),
                    job_inputs['tophat_options'],
                    "-G genes.gtf",
                    "--transcriptome-index=./genes",
                    "-T",
                    "indexed_ref", " ",
                    ",".join(left_reads)])
    if len(right_reads) != 0:
        cmd += " " + ",".join(right_reads)

    # Invoke tophat2 with FASTQ/A file(s) and indexed reference
    try:
        run_shell(cmd)
    except:
        raise dxpy.AppError("Error while running Tophat. This could be caused by an incompatible gene model and reference or incorrect optional parameters. Please check that these are all correct")

    # upload and import the BAM as a Mappings table
    accepted_hits_file = dxpy.upload_local_file('tophat_out/accepted_hits.bam', wait_on_close=True)
    name = job_inputs['output_name']
    name += "_mappings"

    sam_importer = dxpy.DXApp(name="sam_importer")
    print "Importing BAM output of Tophat"
    import_job = sam_importer.run({"file": dxpy.dxlink(accepted_hits_file.get_id()),
                                   "reference_genome": dxpy.dxlink(genome.get_id()),
                                   "name": name})

    cuff_cmd = " ".join(['cufflinks', '-p', str(num_cpus), '-G genes.gtf', '-o cuff'])
    if 'cufflinks_options' in job_inputs:
        cuff_cmd += " " + job_inputs['cufflinks_options']
    cuff_cmd += " tophat_out/accepted_hits.bam"

    # now with mapped reads in hand we can run cufflinks
    try:
        run_shell(cuff_cmd)
    except:
        raise dxpy.AppError("Error while running Cufflinks. Please check that your parameters are valid")

    print "Packing, uploading, and parsing cufflinks output"
    # package cufflinks output
    run_shell("tar -czf cufflinks_output.tar.gz cuff/")
    cuff_name = job_inputs['output_name'] + "_cufflinks_output.tar.gz"
    orig_trans_file = dxpy.upload_local_file("cufflinks_output.tar.gz")
    orig_trans_file.rename(cuff_name)

    transcripts_table = upload_transcripts_file('cuff/genes.fpkm_tracking', job_inputs['output_name'])

    output['mappings'] = {"job": import_job.get_id(), "field": "mappings"}
    output['transcripts'] = dxpy.dxlink(transcripts_table.get_id())
    output['cufflinks_output'] = dxpy.dxlink(orig_trans_file.get_id())

    print "DONE!"
    return output