def run_split_reads_if_needed(job, context, fastq, gam_input_reads, bam_input_reads, reads_file_ids):
    """
    Split the input reads into chunks unless chunking is disabled.

    Returns a list of lists of read chunk file IDs, one inner list per reads
    file. When context.config.single_reads_chunk is set, every reads file is
    passed through untouched as its own single chunk. Otherwise a
    run_split_reads child job is queued and the promise for its return value
    is handed back.
    """
    if context.config.single_reads_chunk:
        RealtimeLogger.info(
            "Bypassing reads splitting because --single_reads_chunk enabled")
        return [[reads_id] for reads_id in reads_file_ids]

    split_job = job.addChildJobFn(
        run_split_reads, context, fastq, gam_input_reads, bam_input_reads,
        reads_file_ids,
        cores=context.config.misc_cores,
        memory=context.config.misc_mem,
        disk=context.config.misc_disk)
    return split_job.rv()
def run_split_gam_reads(job, context, gam_input_reads, gam_reads_file_id):
    """
    Split up an input reads file in GAM format.

    Downloads the GAM identified by gam_reads_file_id into a local work
    directory (under the basename of gam_input_reads), runs ``vg chunk`` on
    it, and writes every produced chunk as an intermediate file.

    Returns the list of chunk file IDs, ordered by chunk file name so the
    result does not depend on directory listing order (matching the FASTQ
    and BAM splitters).
    """
    RealtimeLogger.info("Starting gam split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # We need the input GAM on local disk to chunk it
    gam_path = os.path.join(work_dir, os.path.basename(gam_input_reads))
    job.fileStore.readGlobalFile(gam_reads_file_id, gam_path)

    # Split up the gam into chunks
    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    cmd = ['vg', 'chunk', '-a', os.path.basename(gam_path),
           '--gam-split-size', str(chunk_size),
           '--prefix', 'gam_reads_chunk']
    context.runner.call(job, cmd, work_dir=work_dir)

    # os.listdir() makes no ordering promise; sort so chunk i always refers
    # to the same slice of the input.
    gam_chunk_ids = [
        context.write_intermediate_file(job, os.path.join(work_dir, chunk_name))
        for chunk_name in sorted(os.listdir(work_dir))
        if chunk_name.endswith('.gam') and chunk_name.startswith('gam_reads_chunk')
    ]

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info("Split gam into {} chunks. Process took {} seconds.".format(len(gam_chunk_ids), run_time))

    return gam_chunk_ids
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved, xg_file_id, paths):
    """
    Split the input GAM (unless chunking is disabled), then surject the
    chunks.

    All work is hung off a fresh child job so the sub-workflow is
    self-contained. Returns the promise for the result of the
    run_whole_surject follow-on.
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    if context.config.single_reads_chunk:
        RealtimeLogger.info(
            "Bypassing reads splitting because --single_reads_chunk enabled")
        # One reads file, one chunk.
        reads_chunk_ids = [[gam_input_reads_id]]
    else:
        split_job = child_job.addChildJobFn(
            run_split_reads, context, None, 'aln.gam', None,
            [gam_input_reads_id],
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk)
        reads_chunk_ids = split_job.rv()

    surject_job = child_job.addFollowOnJobFn(
        run_whole_surject, context, reads_chunk_ids, output_name,
        interleaved, xg_file_id, paths,
        cores=context.config.misc_cores,
        memory=context.config.misc_mem,
        disk=context.config.misc_disk)
    return surject_job.rv()
def minigraph_map_all(job, config, gfa_id, fa_id_map):
    """
    Top-level job that maps every input FASTA to the GFA in parallel.

    One minigraph_map_one child is queued per (event, FASTA ID) entry of
    fa_id_map; a merge_gafs_into_paf follow-on then combines their results.
    Returns the promise for that follow-on's return value.
    """
    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    # do the mapping
    gaf_ids = []
    for event, fa_id in fa_id_map.items():
        RealtimeLogger.info("adding child event={} faid={} gfaid={}".format(
            event, fa_id, gfa_id))
        # Disk request scales with the combined size of both inputs.
        disk_needed = 5 * (fa_id.size + gfa_id.size)
        map_job = top_job.addChildJobFn(
            minigraph_map_one, config, event, fa_id, gfa_id,
            cores=1, disk=disk_needed)
        gaf_ids.append(map_job.rv())

    # convert to paf
    return top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_ids).rv()
def get_sfam_ddi_sizes(job, sfam_id, observed=True):
    """
    Count interface rows per (mol_superfam_id, int_superfam_id) pair.

    Downloads the "observed" or "inferred" interactome table for sfam_id
    from the molmimic-interfaces store into this job's temp directory,
    loads it, and groups it by the two superfamily ID columns (NaNs are
    replaced by -1 first).

    Returns a DataFrame with columns mol_superfam_id, int_superfam_id and
    count. The downloaded file is always removed, even if loading fails.
    """
    int_type = "observed" if observed else "inferred"
    work_dir = job.fileStore.getLocalTempDir()
    interface_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    interfaces_key = "{s}/{s}.{o}_interactome".format(s=sfam_id, o=int_type)
    # Keep the download inside the job's temp dir instead of the current
    # working directory.
    interfaces_file = os.path.join(work_dir, os.path.basename(interfaces_key))

    try:
        interface_store.read_input_file(interfaces_key, interfaces_file)
        interfaces = pd.read_hdf(interfaces_file, "table")
    finally:
        try:
            os.remove(interfaces_file)
        except OSError:
            # Nothing was downloaded, or it is already gone.
            pass

    RealtimeLogger.info("COLS: {}".format(interfaces.columns))
    counts = interfaces.fillna(-1.).groupby(
        ["mol_superfam_id", "int_superfam_id"]).size().reset_index(name="count")
    RealtimeLogger.info("SIZES :{}".format(counts))

    return counts
def poll(job, options, file_id, number, cores=0.1, disk='200M', memory='512M'):
    """
    Read a global file through the file store and report where it landed.

    Sleeps a random interval (between options.minSleep and
    options.minSleep + 10 seconds) before and after reading the file, then
    returns (st_dev, st_ino) of the local copy.
    """
    def random_pause():
        time.sleep(random.randint(options.minSleep, options.minSleep + 10))

    # Wait a random amount of time before grabbing the file for others to
    # cache it
    random_pause()

    # Read the file. Don't accept a symlink because then we might just have
    # the filestore's copy, even if caching is not happening.
    local_file = job.fileStore.readGlobalFile(
        file_id, cache=True, mutable=False, symlink=False)

    # Wait a random amount of time again so others can use the file
    random_pause()

    # Stat the file (reads through links)
    file_stats = os.stat(local_file)

    # Check what machine we are
    host = socket.gethostname()
    RealtimeLogger.info(
        'Job {} on host {} sees file at device {} inode {}'.format(
            number, host, file_stats.st_dev, file_stats.st_ino))

    # Return a tuple representing our view of the file.
    # Drop hostname since hostnames are unique per pod.
    return (file_stats.st_dev, file_stats.st_ino)
def best_sfams(job, all_counts, max_sfams=300):
    """
    Rank superfamily pairs by interface count and keep the largest ones.

    all_counts is an iterable of DataFrames with columns mol_superfam_id,
    int_superfam_id and count. Each pair is normalised to a sorted tuple of
    ints so that (a, b) and (b, a) are treated as the same pair; the first
    count seen for a pair wins and later duplicates are only logged.

    The full ranking is written to "sorted_sfams.json" in the molmimic-ddi
    store. Returns the first max_sfams entries as a list of
    ((sfam_a, sfam_b), count) tuples, largest count first.
    """
    import json

    work_dir = job.fileStore.getLocalTempDir()
    out_store = IOStore.get("aws:us-east-1:molmimic-ddi")

    # mol->int should be same as int->mol: remove dupes
    ddi_counts = {}
    for sfam_counts in all_counts:
        for row in sfam_counts.itertuples():
            ddi = tuple(
                map(int, sorted((row.mol_superfam_id, row.int_superfam_id))))
            if ddi in ddi_counts:
                RealtimeLogger.info("{} {}, are counts symmetrical? {}".format(
                    ddi[0], ddi[1],
                    "Yes" if ddi_counts[ddi] == row.count else "No"))
                continue
            # Store a plain int: numpy integers are not JSON serialisable
            # on Python 3.
            ddi_counts[ddi] = int(row.count)

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    sfams = sorted(ddi_counts.items(), key=lambda x: x[1], reverse=True)
    RealtimeLogger.info("sfams is {}".format(sfams))

    sfam_file = os.path.join(work_dir, "sorted_sfams.json")
    with open(sfam_file, "w") as f:
        json.dump(sfams, f)
    out_store.write_output_file(sfam_file, "sorted_sfams.json")

    return sfams[:max_sfams]
def process_sfam(job, sfam_id, pdbFileStoreID, cores=1):
    """
    Process every structural domain (sdi) that belongs to one superfamily.

    Loads the "merged" table from the PDB HDF5 file, selects the unique,
    non-null sdi values whose sfam_id equals sfam_id, and runs
    process_domain on each: through dask when more than two cores are
    requested, otherwise through a map_job_rv child job.

    Returns the dask result, or the promise for the child job's result.
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-full-structures".format(prefix))

    sdoms_file = copy_pdb_h5(job, pdbFileStoreID)

    all_domains = pd.read_hdf(unicode(sdoms_file), "merged")
    in_superfamily = all_domains["sfam_id"] == float(sfam_id)
    sdoms = all_domains[in_superfamily]["sdi"].drop_duplicates().dropna()

    if cores > 2:
        # Only makes sense for slurm or other bare-metal clusters
        setup_dask(cores)
        d_sdoms = dd.from_pandas(sdoms, npartitions=cores)
        RealtimeLogger.info("Running sfam dask {}".format(sdoms))
        # NOTE(review): sdoms looks like a Series here, yet the lambda reads
        # row.sdi and apply() is given axis=1 -- confirm this path works.
        processed_domains = d_sdoms.apply(
            lambda row: process_domain(job, row.sdi, sdoms_file),
            axis=1).compute()
    else:
        domain_job = job.addChildJobFn(
            map_job_rv, process_domain, sdoms, pdbFileStoreID,
            preemptable=True)
        processed_domains = domain_job.rv()

    return processed_domains
def run_split_fastq(job, context, fastq, fastq_i, sample_fastq_id):
    """
    Split one input FASTQ (optionally gzipped) into gzipped chunks.

    fastq[fastq_i] supplies the file name; sample_fastq_id is the file store
    ID it is downloaded from. The file is streamed through ``split`` with a
    pigz filter, and every resulting chunk is written as an intermediate
    file.

    Returns the chunk file IDs in sorted chunk-name order.
    """
    RealtimeLogger.info("Starting fastq split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Fetch the FASTQ we are going to split
    local_name = os.path.basename(fastq[fastq_i])
    fastq_path = os.path.join(work_dir, local_name)
    fastq_name, extension = os.path.splitext(local_name)
    fastq_gzipped = extension == '.gz'
    if fastq_gzipped:
        # Strip the inner extension too (e.g. reads.fq.gz -> reads)
        fastq_name = os.path.splitext(fastq_name)[0]

    job.fileStore.readGlobalFile(sample_fastq_id, fastq_path)

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    # 4 lines per read
    chunk_lines = chunk_size * 4

    # Note we do this on the command line because Python is too slow
    if fastq_gzipped:
        reader = ['gzip', '-d', '-c', os.path.basename(fastq_path)]
    else:
        reader = ['cat', os.path.basename(fastq_path)]
    compress_threads = max(1, int(context.config.fq_split_cores) - 1)
    splitter = [
        'split', '-l', str(chunk_lines),
        '--filter=pigz -p {} > $FILE.fq.gz'.format(compress_threads),
        '-', '{}-chunk.'.format(fastq_name)
    ]
    cmd = [reader, splitter]

    context.runner.call(job, cmd, work_dir=work_dir, tool_name='pigz')

    chunk_prefix = '{}-chunk'.format(fastq_name)
    fastq_chunk_ids = []
    for chunk_name in sorted(os.listdir(work_dir)):
        if not chunk_name.endswith('.fq.gz'):
            continue
        if not chunk_name.startswith(chunk_prefix):
            continue
        fastq_chunk_ids.append(
            context.write_intermediate_file(
                job, os.path.join(work_dir, chunk_name)))

    run_time = timeit.default_timer() - start_time
    RealtimeLogger.info(
        "Split fastq into {} chunks. Process took {} seconds.".format(
            len(fastq_chunk_ids), run_time))

    return fastq_chunk_ids
def read_input_file(self, input_path, local_path):
    """
    Get input from the filesystem.

    Copies input_path (resolved relative to self.path_prefix) to local_path.
    Any existing file at local_path is removed first. The copy goes to a
    temporary file next to local_path and is then renamed into place, so
    local_path never holds a partial copy.

    Raises AssertionError if local_path cannot be cleared and RuntimeError
    if the source file does not exist. Afterwards, if we own the source file
    and it is user-writable, its user write bit is cleared (best effort).
    """

    RealtimeLogger.debug("Loading {} from FileIOStore in {} to {}".format(
        input_path, self.path_prefix, local_path))

    if os.path.exists(local_path):
        # Try deleting the existing item if it already exists
        try:
            os.unlink(local_path)
        except OSError:
            # Don't fail here, fail complaining about the assertion, which
            # will be more informative. Only filesystem errors are ignored;
            # KeyboardInterrupt and friends still propagate.
            pass

    # Make sure the path is clear for copying
    assert (not os.path.exists(local_path))

    # Where is the file actually?
    real_path = os.path.abspath(os.path.join(self.path_prefix, input_path))

    if not os.path.exists(real_path):
        RealtimeLogger.error(
            "Can't find {} from FileIOStore in {}!".format(
                input_path, self.path_prefix))
        raise RuntimeError("File {} missing!".format(real_path))

    # Make a temporary file
    temp_handle, temp_path = tempfile.mkstemp(
        dir=os.path.dirname(local_path))
    os.close(temp_handle)

    moved_into_place = False
    try:
        # Copy to the temp file
        shutil.copy2(real_path, temp_path)

        # Rename the temp file to the right place, atomically
        RealtimeLogger.info("rename {} -> {}".format(temp_path, local_path))
        os.rename(temp_path, local_path)
        moved_into_place = True
    finally:
        if not moved_into_place:
            # Don't leave a stray temp file behind when the copy or rename
            # fails; the original error keeps propagating.
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    # Look at the file stats
    file_stats = os.stat(real_path)

    if (file_stats.st_uid == os.getuid() and
            file_stats.st_mode & stat.S_IWUSR):
        # We own the stored file and can write to it. Protect it from being
        # clobbered by accident.
        try:
            # The write bit is known to be set here, so XOR clears it.
            os.chmod(real_path, file_stats.st_mode ^ stat.S_IWUSR)
        except OSError:
            # If something goes wrong here (like us not having permission to
            # change permissions), ignore it.
            pass
def ensure_disk(job, job_fn, job_fn_args, job_fn_kwargs, file_id_list, factor=8, padding=1024**3):
    """
    Make sure the running job has enough disk for its inputs and outputs.

    The requirement is the summed ``size`` of every file ID in file_id_list,
    multiplied by factor (room for intermediates and results), plus padding
    bytes.

    Takes the job, the function that is the job, the list of arguments it
    was called with (except the job object), the dict of keyword arguments,
    and an iterable of file IDs.

    If job.disk is below the requirement, re-queues job_fn as a child job
    with the same cores and memory but the larger disk, and returns the
    promise for its result. Otherwise returns None.

    TODO: Convert to promised requirements if it is sufficiently expressive.
    """
    # TODO: Allow different factors for different file IDs
    # We only need to multiply e.g. BAM files, not indexes
    input_bytes = sum(file_id.size for file_id in file_id_list)
    required_disk = input_bytes * factor + padding

    if not job.disk < required_disk:
        # Disk we have is OK
        return None

    # Re-queue with more disk
    RealtimeLogger.info(
        "Re-queueing job with {} bytes of disk; originally had {}".format(
            required_disk, job.disk))
    requeued = job.addChildJobFn(job_fn, *job_fn_args,
                                 cores=job.cores,
                                 memory=job.memory,
                                 disk=required_disk,
                                 **job_fn_kwargs)
    return requeued.rv()
def gcsa_index_job(job, options, vg_ids, primary_path_names=None):
    """
    Index the given graphs into a GCSA/LCP index, and return a pair of file
    IDs for the GCSA and the LCP files.

    Will prune the graph before indexing unless options.prune_opts is
    explicitly set as an empty list.

    The work is delegated to a toil_vg.vg_index.run_gcsa_prep child job; the
    value returned here is the promise for that job's result.
    """

    # Do any options manipulation we need to do

    # Strip out stuff we don't want and apply config defaults
    options = sanitize_options(options)

    # Add the outstore, which we have sort of disabled. It insists on writing
    # stuff, so just drop it in the current directory. It doesn't read it back.
    options.out_store = "file:."

    # Don't use outstore instead of the file store
    options.force_outstore = False

    # Pretend we're the pipeline tool
    options.tool = "pipeline"

    # Add stuff that toil vg index uses

    # options.graphs has to have a name for every graph, to save it under in
    # the local temp dir.
    # range() rather than xrange() so this also runs on Python 3.
    graph_count = len(vg_ids)
    options.graphs = ["graph{}".format(i) for i in range(graph_count)]

    # We also need a "chroms" giving the primary path for each graph. It's OK
    # if the path doesn't exist in a given graph, but if it does it will be
    # added to the index.

    # We have primary path names to use. We can just try and retain all the
    # paths in all graphs.
    RealtimeLogger.info("Want to GCSA-index {} with paths {}".format(
        vg_ids, primary_path_names))

    # Fake path names
    options.chroms = ["fake{}".format(i) for i in range(graph_count)]

    # options.index_name has to have the basename for the .gcsa in the local
    # temp dir.
    options.index_name = "gcsaindex"

    return job.addChildJobFn(toil_vg.vg_index.run_gcsa_prep, options, vg_ids,
                             primary_path_override=primary_path_names,
                             cores=options.misc_cores,
                             memory=options.misc_mem,
                             disk=options.misc_disk).rv()
def call_with_singularity(self, job, args, work_dir, outfile, errfile, check_output, tool_name, mount_list):
    """
    Thin wrapper for singularity_call that will use internal lookup to
    figure out the location of the singularity file. Only exposes
    singularity_call parameters used so far. Expect args as list of lists.
    If the (toplevel) list has size > 1, the piping interface is used.

    While the container runs, LC_ALL and VG_FULL_TRACEBACK (plus R_LIBS for
    Rscript) are set in os.environ under environment_lock. The previous
    values are restored afterwards, also when the call raises.

    errfile is accepted for signature compatibility but is not used here.
    Returns whatever singularityCheckOutput (when check_output is True) or
    singularityCall returns.
    """

    RealtimeLogger.info(truncate_msg("Singularity Run: {}".format(" | ".join(" ".join(x) for x in args))))
    start_time = timeit.default_timer()

    # we use the first argument to look up the tool in the singularity map
    # but allow overriding of this with the tool_name parameter
    name = tool_name if tool_name is not None else args[0][0]
    tool = self.docker_tool_map[name]

    parameters = args[0] if len(args) == 1 else args

    # Get a lock on the environment
    global environment_lock
    with environment_lock:
        # TODO: We can't stop other threads using os.environ or subprocess or w/e on their own

        # Set the locale to C for consistent sorting, and activate vg traceback
        update_env = {'LC_ALL': 'C', 'VG_FULL_TRACEBACK': '1'}
        if name == 'Rscript':
            # The R dockers by default want to install packages in non-writable directories. Sometimes.
            # Make sure a writable directory which exists is used.
            update_env['R_LIBS'] = '/tmp'

        old_env = {}
        try:
            for env_name, env_val in update_env.items():
                # Remember the old value before overwriting so exactly the
                # variables we touched get restored.
                old_env[env_name] = os.environ.get(env_name)
                os.environ[env_name] = env_val

            if check_output is True:
                ret = singularityCheckOutput(job, tool, parameters=parameters,
                                             workDir=work_dir, mount_list=mount_list)
            else:
                ret = singularityCall(job, tool, parameters=parameters, workDir=work_dir,
                                      outfile=outfile, mount_list=mount_list)
        finally:
            # Restore old locale and vg traceback, even if the call failed,
            # so a failing tool cannot leak settings into later calls.
            for env_name, old_val in old_env.items():
                if old_val is not None:
                    os.environ[env_name] = old_val
                else:
                    os.environ.pop(env_name, None)

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info("Successfully singularity ran {} in {} seconds.".format(
        " | ".join(" ".join(x) for x in args), run_time))

    if outfile:
        # Push anything buffered for the output file down to disk.
        outfile.flush()
        os.fsync(outfile.fileno())

    return ret
def setup(job, inputFile, N, downCheckpoints, options):
    """
    Kick off the merge sort.

    Queues the root ``down`` job (path 'root') for inputFile and returns the
    promise for its result, the FileID of the sorted file.
    """
    RealtimeLogger.info("Starting the merge sort")
    root_job = job.addChildJobFn(
        down,
        inputFile, N, 'root', downCheckpoints,
        options=options,
        preemptable=True,
        memory=sortMemory)
    return root_job.rv()
def list_input_directory(self, input_path, recursive=False, with_times=False):
    """
    Loop over directories on the filesystem.

    Generator over the entries of input_path, resolved relative to
    self.path_prefix. Yields nothing when the path is missing or is not a
    directory.

    When recursive is true, subdirectories are descended into and their
    contents are yielded with the subdirectory name prepended; otherwise
    directories are yielded like any other entry.

    When with_times is true, yields (name, mtime) tuples, mtime being
    seconds since the epoch; otherwise yields just the names.
    """

    RealtimeLogger.info("Enumerating {} from "
                        "FileIOStore in {}".format(input_path, self.path_prefix))

    base_dir = os.path.join(self.path_prefix, input_path)

    if not os.path.isdir(base_dir):
        # Nothing to list over: either it does not exist, or it is a file
        # and we can only list directories.
        return

    for item in os.listdir(base_dir):
        item_path = os.path.join(base_dir, item)

        if recursive and os.path.isdir(item_path):
            # We're recursing and this is a directory.
            # Recurse on this, asking for names only; times are looked up
            # here.
            for subitem in self.list_input_directory(
                    os.path.join(input_path, item), recursive):

                # Make relative paths include this directory name and yield
                # them
                name_to_yield = os.path.join(item, subitem)

                if with_times:
                    # Stat the file under path_prefix, where it actually
                    # lives, not relative to the current directory.
                    mtime_epoch_seconds = os.path.getmtime(
                        os.path.join(item_path, subitem))
                    yield name_to_yield, mtime_epoch_seconds
                else:
                    yield name_to_yield
        else:
            # This isn't a directory or we aren't being recursive
            # Just report this individual item.
            if with_times:
                # What is the mtime in seconds since epoch?
                mtime_epoch_seconds = os.path.getmtime(item_path)
                yield item, mtime_epoch_seconds
            else:
                yield item
def convert_job(job, options, sam_url, bam_id):
    """
    Subset and convert BAM to FASTQ pair. Returns FASTQ IDs.

    The BAM with file ID bam_id is streamed to SAM with samtools, optionally
    filtered with awk to reads whose contig or mate contig matches
    options.contig (or an alt of it), and then converted to two FASTQ files
    with smartSam2Fastq. sam_url is only used in log messages.

    Returns a (fq1_id, fq2_id) tuple of file store IDs.
    """

    # We have to deal with relative paths relative to here if we want Docker to
    # work right
    work_dir = job.fileStore.getLocalTempDir()

    # Read the BAM back, into the work_dir
    sorted_bam = "sorted.bam"
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, sorted_bam))

    RealtimeLogger.info("Subset {} to SAM".format(sam_url))

    # Then stream to SAM and select just the reads we want. This file is used
    # by this Python code as a soy-based FIFO substitute, and so doesn't need
    # to be in work_dir.
    subset_sam = job.fileStore.getLocalTempDir() + "/subset.sam"

    # We start out with just a view pipeline
    sam_command = [["samtools", "view", sorted_bam]]

    if options.contig is not None:
        # Subset to this contig and related alts with awk.
        # NOTE(review): options.contig is pasted into the awk regex as-is,
        # so regex metacharacters in a contig name are interpreted.
        sam_command.append([
            "awk",
            ("{if ($3 ~ /" + options.contig + "(_.*)?$/ || $7 ~ /" +
             options.contig + "(_.*)?$/) print}")
        ])

    # Close the SAM file as soon as the pipeline is done, so it is complete
    # on disk before we read it back and the handle is not leaked.
    with open(subset_sam, "w") as sam_out:
        options.drunner.call(job, sam_command, outfile=sam_out,
                             work_dir=work_dir)

    RealtimeLogger.info("Convert {} to FASTQ".format(sam_url))

    with job.fileStore.writeGlobalFileStream() as (fq1_handle, fq1_id):
        with job.fileStore.writeGlobalFileStream() as (fq2_handle, fq2_id):
            with open(subset_sam, "r") as sam_in:

                # Then prep options for running the converter script in this
                # Python
                convert_options = argparse.Namespace()
                convert_options.input_sam = sam_in
                convert_options.fq1 = fq1_handle
                convert_options.fq2 = fq2_handle
                convert_options.drop_secondary = True
                convert_options.expect_paired = True
                convert_options.interleaved = False

                smartSam2Fastq.run(convert_options)

    return fq1_id, fq2_id
def run_split_bam_reads(job, context, bam_input_reads, bam_reads_file_id):
    """
    Split up an input reads file in BAM format.

    The BAM is streamed as SAM records through ``split``; each slice gets
    the original header put back in front and is re-encoded as BAM. Every
    chunk is written as an intermediate file.

    Returns the chunk file IDs in sorted chunk-name order.
    """
    RealtimeLogger.info("Starting bam split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Fetch the BAM we are going to split
    bam_path = os.path.join(work_dir, os.path.basename(bam_input_reads))
    job.fileStore.readGlobalFile(bam_reads_file_id, bam_path)

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    # 1 line per read
    chunk_lines = chunk_size * 1

    view_cmd = ['samtools', 'view', os.path.basename(bam_path)]

    # Filter run by split for every chunk: header + records -> BAM
    header_part = '--filter=bash -c \'cat <(samtools view -H {}) <(cat -)'.format(
        os.path.basename(bam_path))
    encode_part = ' | samtools view -O BAM --threads {} -'.format(
        max(1, int(context.config.fq_split_cores) - 1))
    chunk_filter = header_part + encode_part + ' > $FILE.bam\''

    split_cmd = ['split', '-l', str(chunk_lines), chunk_filter, '-',
                 'bam_reads_chunk.']
    cmd = [view_cmd, split_cmd]

    context.runner.call(job, cmd, work_dir=work_dir)

    bam_chunk_ids = []
    for chunk_name in sorted(os.listdir(work_dir)):
        is_chunk = (chunk_name.endswith('.bam') and
                    chunk_name.startswith('bam_reads_chunk'))
        if is_chunk:
            bam_chunk_ids.append(
                context.write_intermediate_file(
                    job, os.path.join(work_dir, chunk_name)))

    run_time = timeit.default_timer() - start_time
    RealtimeLogger.info(
        "Split bam into {} chunks. Process took {} seconds.".format(
            len(bam_chunk_ids), run_time))

    return bam_chunk_ids
def start_toil(job):
    """
    Root job: stage shared inputs and queue one follow-on per IBIS table.

    Downloads PDB.h5 and pdb_chain_taxonomy.h5 from the molmimic-ibis store
    and every available "<sfam>/<sfam>.observed_interactome" from the
    molmimic-interfaces store, writes them all to the job's file store, then
    queues get_inferred_structural_interactome_by_table for tables 1-86
    (except 51) plus a merge_inferred_interactome follow-on.

    Raises AssertionError if no observed interactome could be loaded.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Starting job")
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    int_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    # Download PDB info
    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    # Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_file)

    # Download PDB Taxonomy information
    tax_file = os.path.join(work_dir, "pdb_chain_taxonomy.h5")
    in_store.read_input_file("pdb_chain_taxonomy.h5", tax_file)

    # Add tax info into local job store
    taxFileStoreID = job.fileStore.writeGlobalFile(tax_file)

    tables = set(range(1, 87)) - set([51])

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()

    sfamFileStoreIDs = {}
    for s in sfams:
        k = "{0}/{0}.observed_interactome".format(int(s))
        if int_store.exists(k):
            RealtimeLogger.info("Loading {}".format(s))
            f = job.fileStore.getLocalTempFileName()
            int_store.read_input_file(k, f)
            sfamFileStoreIDs[int(s)] = job.fileStore.writeGlobalFile(f)
            # The file store has its own copy now; free the local one.
            os.remove(f)
        else:
            RealtimeLogger.info("FAILED Loading {}".format(s))

    assert len(sfamFileStoreIDs) > 0

    os.remove(tax_file)
    os.remove(pdb_file)

    job.log("Running tables: {}".format(tables))
    # Iterate the set of tables (the old loop iterated the undefined name
    # `table`). Sorted only to make the queueing order reproducible.
    for table in sorted(tables):
        job.addFollowOnJobFn(get_inferred_structural_interactome_by_table,
                             table, pdbFileStoreID, taxFileStoreID,
                             sfamFileStoreIDs)

    # NOTE(review): the merge is added to the same job as the per-table
    # follow-ons, exactly as before; confirm it is really scheduled after
    # them rather than alongside them.
    job.addFollowOnJobFn(merge_inferred_interactome, pdbFileStoreID)
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved, xg_file_id, paths):
    """
    Surject all GAM chunks in parallel and merge the results.

    reads_chunk_ids is a list of per-file chunk ID lists; the i-th chunk of
    every file is surjected together by one run_chunk_surject job. The
    resulting BAMs are merged by a run_merge_bams follow-on under
    output_name. If interleaved is true, expects paired-interleaved GAM
    input and writes paired BAM output. Surjects against the given
    collection of paths in the given XG file.

    Returns the promise for the merge job's result.
    """
    RealtimeLogger.info(
        "Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # One entry per chunk: promise for the surjected BAM, and promise for
    # the time it took (collected but not passed on).
    bam_chunk_file_ids = []
    bam_chunk_running_times = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    # zip(*...) regroups the per-file lists into per-chunk tuples.
    per_chunk_inputs = zip(*reads_chunk_ids)
    for chunk_id, chunk_filename_ids in enumerate(per_chunk_inputs):
        chunk_output_name = '{}_chunk{}'.format(output_name, chunk_id)
        # Run graph surject on each gam chunk
        chunk_surject_job = child_job.addChildJobFn(
            run_chunk_surject, context, interleaved, xg_file_id, paths,
            chunk_filename_ids, chunk_output_name,
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(chunk_surject_job.rv(0))
        bam_chunk_running_times.append(chunk_surject_job.rv(1))

    merge_job = child_job.addFollowOnJobFn(
        run_merge_bams, context, output_name, bam_chunk_file_ids,
        cores=context.config.misc_cores,
        memory=context.config.misc_mem,
        disk=context.config.misc_disk)
    return merge_job.rv()
def run_concat_files(job, context, file_ids, dest_name=None, header=None):
    """
    Utility job to concatenate some files. Returns the concatenated file ID.

    If given a dest_name, writes the result to the out store with the given
    name. (We wanted to use name, but that kwarg is reserved by Toil.)
    Without one, the result is written as an intermediate file.

    If given a header, prepends the header to the file with a trailing
    newline.

    If the job lacks the disk to hold its inputs twice over, it re-queues
    itself with more disk and returns the promise for that run instead.
    """
    retry_kwargs = {"dest_name": dest_name, "header": header}
    requeue_promise = ensure_disk(job, run_concat_files,
                                  [context, file_ids], retry_kwargs,
                                  file_ids, factor=2)
    if requeue_promise is not None:
        # We requeued ourselves with more disk to accommodate our inputs
        return requeue_promise

    is_intermediate = dest_name is None
    work_dir = job.fileStore.getLocalTempDir()
    out_name = os.path.join(
        work_dir, 'output.dat' if is_intermediate else dest_name)

    # Concatenate all the files
    # TODO: We don't use the trick where we append to the first file to save a copy. Should we?
    # NOTE(review): the output is opened in text mode; confirm the streams
    # from readGlobalFileStream hand back a matching type.
    with open(out_name, 'w') as out_file:
        if header is not None:
            # Put the header if specified
            out_file.write(header + '\n')
        for file_id in file_ids:
            with job.fileStore.readGlobalFileStream(file_id) as in_file:
                # Then beam over each file
                shutil.copyfileobj(in_file, out_file)

    if not is_intermediate:
        # Write to outstore under the given name.
        RealtimeLogger.info(
            "Concatenated {} files into output file {} -> {}".format(
                len(file_ids), out_name, dest_name))
        return context.write_output_file(job, out_name, dest_name)

    # Send back an intermediate file
    RealtimeLogger.info(
        "Concatenated {} files into intermediate file {}".format(
            len(file_ids), out_name))
    return context.write_intermediate_file(job, out_name)
def calculate_features_for_atom(self, atom, only_aa=False, only_atom=False, non_geom_features=False, use_deepsite_features=False, warn_if_buried=False):
    """
    Compute the feature vector for one atom and cache it.

    The first flag set, in the order use_deepsite_features, only_atom,
    only_aa, non_geom_features, selects a reduced feature set; with none set
    the full vector of self.n_atom_features values is built.

    The result is stored in self.atom_features at index
    atom.serial_number - 1. Returns the features, or a
    (features, is_buried) tuple when warn_if_buried is true.
    """
    if use_deepsite_features:
        features = self.get_deepsite_features(atom)
        if warn_if_buried:
            # Was misspelled "is_burried", which left is_buried unbound at
            # the return below.
            is_buried = self.get_accessible_surface_area(atom)[-2]
    elif only_atom:
        features = self.get_element_type(atom)
        if warn_if_buried:
            is_buried = self.get_accessible_surface_area(atom)[-2]
    elif only_aa:
        features = self.get_residue(atom)
        if warn_if_buried:
            is_buried = self.get_accessible_surface_area(atom)[-2]
    elif non_geom_features:
        features = np.zeros(13)
        features[0:5] = self.get_element_type(atom)
        features[5:9] = self.get_charge_and_electrostatics(atom)
        features[9:13] = self.get_hydrophobicity(atom)
        is_buried = self.get_accessible_surface_area(atom)[-2]
    else:
        features = np.empty(self.n_atom_features)

        features[0:13] = self.get_atom_type(atom)
        features[13:18] = self.get_element_type(atom)
        features[18:19] = self.get_vdw(atom)
        features[19:26] = self.get_charge_and_electrostatics(atom)
        features[26:30] = self.get_concavity(atom)
        features[30:34] = self.get_hydrophobicity(atom)
        features[34:40] = self.get_accessible_surface_area(atom)
        features[40:61] = self.get_residue(atom)
        features[61:64] = self.get_ss(atom)
        features[64:70] = self.get_deepsite_features(atom, calc_charge=False, calc_conservation=False)
        features[70:73] = self.get_evolutionary_conservation_score(atom)

        # NOTE(review): the other branches take element [-2] of the surface
        # area result, which would land at features[38] here, not
        # features[35] -- confirm which index is the buried flag.
        is_buried = bool(features[35])

    RealtimeLogger.info("Finished atom {}".format(atom))

    # Serial numbers are 1-based; the cache is 0-based.
    self.atom_features[atom.serial_number-1] = features

    if warn_if_buried:
        return features, is_buried
    else:
        return features
def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory):
    """
    Recursive "divide" step of the merge sort.

    Input is a file, a subdivision size N, and a path in the hierarchy of
    jobs. A file larger than N bytes is cut in two at its midpoint, each
    half is handed to a child ``down`` job, and an ``up`` follow-on merges
    their results. A smaller file is sorted directly.

    Returns the file ID of the sorted file, or the promise for it.
    """
    RealtimeLogger.info("Down job starting: %s" % path)

    # Read the file
    inputFile = job.fileStore.readGlobalFile(inputFileStoreID, cache=False)
    length = os.path.getsize(inputFile)

    if length > N:
        # We will subdivide the file
        RealtimeLogger.critical("Splitting file: %s of size: %s"
                                % (inputFileStoreID, length))

        # Split the file into two copies
        midPoint = getMidPoint(inputFile, 0, length)
        cut = midPoint + 1

        lower_path = job.fileStore.getLocalTempFile()
        with open(lower_path, 'w') as lower_out:
            lower_out.write(copySubRangeOfFile(inputFile, 0, cut))

        upper_path = job.fileStore.getLocalTempFile()
        with open(upper_path, 'w') as upper_out:
            upper_out.write(copySubRangeOfFile(inputFile, cut, length))

        # Call down recursively. By giving the rv() of the two jobs as
        # inputs to the follow-on job, up, we communicate the dependency
        # without hindering concurrency.
        # NOTE(review): the second child asks for options.mergeMemory while
        # the merge follow-on asks for options.sortMemory -- confirm these
        # are not swapped.
        lower_sorted = job.addChildJobFn(
            down, job.fileStore.writeGlobalFile(lower_path), N, path + '/0',
            downCheckpoints, checkpoint=downCheckpoints, options=options,
            preemptable=True, memory=options.sortMemory).rv()
        upper_sorted = job.addChildJobFn(
            down, job.fileStore.writeGlobalFile(upper_path), N, path + '/1',
            downCheckpoints, checkpoint=downCheckpoints, options=options,
            preemptable=True, memory=options.mergeMemory).rv()
        result = job.addFollowOnJobFn(
            up, lower_sorted, upper_sorted, path + '/up',
            preemptable=True, options=options,
            memory=options.sortMemory).rv()
    else:
        # We can sort this bit of the file
        RealtimeLogger.critical("Sorting file: %s of size: %s"
                                % (inputFileStoreID, length))

        # Sort a copy and write it back to the fileStore
        sorted_copy = inputFile + '.sort'
        shutil.copyfile(inputFile, sorted_copy)
        sort(sorted_copy)
        result = job.fileStore.writeGlobalFile(sorted_copy)

    RealtimeLogger.info("Down job finished: %s" % path)
    return result
def create_data_loader(job, sfam_id, preemptable=True):
    """
    Create H5 for Molmimic3dCNN to read.

    Parses the IDs in "<sfam_id>_nr.fasta" (dropping the last two characters
    of each record ID) into pdb, chain, sdi and domain number, and writes
    them as table "table" of "<sfam_id>.h5" next to the FASTA file.

    Returns None. If no IDs could be unpacked (e.g. the FASTA has no
    records), logs that and writes nothing.

    Note: move this somewhere else
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    id_format = re.compile(
        "^([A-Z0-9]{4})_([A-Za-z0-9]+)_sdi([0-9]+)_d([0-9]+)$")

    # Get all with keys same sfam, but do not download
    # NOTE(review): `keys` is never used below -- confirm whether this store
    # listing is still needed.
    in_store = IOStore.get("{}:molmimic-clustered-structures".format(prefix))
    keys = [id_format.match(f).groups() for f in in_store.list_input_directory(sfam_id) \
        if f.endswith(".pdb") and id_format.match(f)]

    # NOTE(review): PDB_PATH and dataset_name are not defined in this
    # function; they must come from module scope -- confirm both exist.
    pdb_path = os.path.join(PDB_PATH, dataset_name, "by_superfamily",
                            str(int(sfam_id)))
    clusters_file = os.path.join(pdb_path, "{}_nr.fasta".format(int(sfam_id)))

    try:
        # The loop variable is `s`; the old code read the undefined name
        # `seq` and always failed with a NameError.
        pdb, chain, sdi, domain = zip(*[id_format.match(s.id[:-2]).groups() \
            for s in SeqIO.parse(clusters_file, "fasta")])
    except ValueError:
        # zip(*[]) yields nothing to unpack when there are no records.
        RealtimeLogger.info(
            "Unable to create data loading file for {}.".format(sfam_id))
        return

    domains = pd.DataFrame({
        "pdb": pdb,
        "chain": chain,
        "domNo": domain,
        "sdi": sdi
    })

    data_loader = os.path.join(pdb_path, "{}.h5".format(int(sfam_id)))
    domains.to_hdf(unicode(data_loader), "table", complevel=9, complib="bzip2")
def get_original_complexes(job, mol_sfam, int_sfam, group, int_type, work_dir=None):
    """
    Build one merged complex file per interface in group.

    group is iterated as (key, frame) pairs; only the first row of each
    frame is used. For every row, process_interface fetches the two domain
    files, which are then merged by prep with chain labels "M" and "I".

    work_dir defaults to the current working directory.

    Returns a list with one entry per item of group: the merged file, or
    None when either domain file was not found.
    """
    if work_dir is None:
        # os.getcwd() is the real API; os.cwd() does not exist and raised
        # AttributeError.
        work_dir = os.getcwd()

    complex_files = []
    for _, row in group:
        row = row.iloc[0]
        RealtimeLogger.info("ROW: {}".format(row))

        # The residue lists are returned too but are not needed here.
        mol_file, _mol_resi, int_file, _int_resi = process_interface(
            job, row, int_type, work_dir=work_dir)

        if mol_file is None or int_file is None:
            # PDB files not found, skip
            RealtimeLogger.info(
                "Cannot download PDB {}.{}.{} bc it was not found".format(
                    row.mol_pdb, row.mol_chain, row.mol_sdi_id))
            complex_files.append(None)
            continue

        merged_file = next(
            prep((mol_file, "M"), (int_file, "I"),
                 merge=True,
                 work_dir=work_dir))

        complex_files.append(merged_file)

    return complex_files
def up(job, inputFileID1, inputFileID2, options, memory=sortMemory):
    """
    "Conquer" step of the merge sort.

    Merges the two input files into a new global file, treating all three
    streams as UTF-8, then deletes both inputs from the file store.
    Returns the file ID of the merged file.
    """
    utf8_writer = codecs.getwriter('utf-8')
    utf8_reader = codecs.getreader('utf-8')

    with job.fileStore.writeGlobalFileStream() as (raw_out, outputFileStoreID):
        merged_out = utf8_writer(raw_out)
        with job.fileStore.readGlobalFileStream(inputFileID1) as raw_in1:
            first_in = utf8_reader(raw_in1)
            with job.fileStore.readGlobalFileStream(inputFileID2) as raw_in2:
                second_in = utf8_reader(raw_in2)
                RealtimeLogger.info(
                    "Merging %s and %s to %s" %
                    (inputFileID1, inputFileID2, outputFileStoreID))
                merge(first_in, second_in, merged_out)

    # Clean up the input files - these deletes will occur after the
    # completion is successful.
    for consumed_id in (inputFileID1, inputFileID2):
        job.fileStore.deleteGlobalFile(consumed_id)

    return outputFileStoreID
def copy_everything(job, options):
    """
    List the input store and queue child jobs that copy the matching files.

    Every name listed recursively from the input store is kept if it matches
    options.pattern (case-sensitive fnmatch). The kept names are grouped into
    batches of options.batch_size and one copy_batch child job is added per
    batch.
    """
    # Set up the IO stores. The output store is opened here but not used
    # again in this function.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    def matching(names):
        # Lazily pass through only the names that match the pattern.
        for name in names:
            if fnmatch.fnmatchcase(name, options.pattern):
                yield name

    wanted = matching(in_store.list_input_directory("", recursive=True))

    batch_count = 0
    for batch_count, padded in enumerate(group(wanted, options.batch_size), 1):
        # Grouping may pad a batch with None; drop the padding.
        batch = [name for name in padded if name is not None]

        job.addChildJobFn(copy_batch, options, batch,
                          cores=1, memory="1G", disk="10G")

        if batch_count % 10 == 0:
            RealtimeLogger.info("Queued {} batches...".format(batch_count))

    RealtimeLogger.info("Queued {} total batches".format(batch_count))
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    """
    Fan out inferred-interactome work for one IBIS table, one superfamily at
    a time.

    Fetches ``IBIS_inferred_<table>.h5``, stores it in the job's file store,
    collects the distinct ``nbr_superfam_id`` values from it and runs
    get_table_sfams (via map_job) for every superfamily that still needs
    work. A superfamily is skipped when its inferred interactome already
    exists in the output store or its observed interactome does not. The
    local copy of the table is removed at the end if possible.
    """
    # Neither of these two values is used further down in this function.
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    # Read in H5 for entire table
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfam_frame = filter_hdf_chunks(
        tableInfPath, "Intrac{}".format(table), columns=["nbr_superfam_id"])
    sfam_frame = sfam_frame.drop_duplicates().dropna()

    # Superfamilies that are already done, or whose observed interactome is
    # not available yet.
    skip_sfam = set()
    for sfam in sfam_frame["nbr_superfam_id"]:
        sfam_key = int(sfam)
        if out_store.exists("{0}/{0}.inferred_interactome".format(sfam_key)):
            skip_sfam.add(sfam)
        elif not out_store.exists("{0}/{0}.observed_interactome".format(sfam_key)):
            skip_sfam.add(sfam)

    remaining = sfam_frame[~sfam_frame["nbr_superfam_id"].isin(skip_sfam)]
    sfams = remaining["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    if sfams:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
                pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    # Best-effort removal of the local table file.
    try:
        os.remove(tableInfPath)
    except OSError:
        pass
def main_job(job, options, sam_urls):
    """
    Root job of the Toil workflow. Download the sample URLs.

    Adds one extract_job child per URL and wraps its result so that the pair
    of FASTQ IDs becomes a Directory with entries "fq1.fastq" and
    "fq2.fastq".

    Returns a Directory containing a bunch of output files: each per-sample
    directory is mounted under the base filename of its URL. URLs that share
    a base filename overwrite each other in the mapping.
    """
    RealtimeLogger.info("Main job starting")
    RealtimeLogger.info("Temp directory location: {}".format(
        job.fileStore.getLocalTempDir()))

    # Make sure we can use samtools
    options.drunner.call(job, [["samtools", "--version"]])

    def fastqs_to_directory(fastq_ids):
        # Replaces `lambda (fq1, fq2): ...`: tuple parameters are a
        # SyntaxError on Python 3 (PEP 3113). Unpacking inside the body keeps
        # the exact same "must be a pair" behaviour on Python 2.
        fq1, fq2 = fastq_ids
        return Directory({
            "fq1.fastq": fq1,
            "fq2.fastq": fq2,
        })

    # We'll fill this with promises for subdirectories by sample filename
    subdir_promises = {}

    for sam_url in sam_urls:
        # Work out the base filename
        sam_filename = os.path.basename(urlparse.urlparse(sam_url).path)

        # Go download and convert the reads, and stick the FASTQs in a
        # directory
        extract = job.addChildJobFn(extract_job, options, sam_url,
                                    cores=1, memory="16G", disk="500G")
        subdir_promises[sam_filename] = ToilPromise.wrap(extract).then(
            fastqs_to_directory)

    # Mount each subdirectory under its original sam/bam/cram filename
    return ToilPromise.all(subdir_promises).then(
        lambda dirs: Directory().mount_all(dirs)).unwrap_result()
def extract_job(job, options, sam_url):
    """
    Name-sort the SAM/BAM/CRAM reads at the given URL into a BAM file, save
    it to the file store, and hand it to a convert_job child.

    Returns the promised return value of that child job.
    """
    # Paths handed to the tool are relative to this directory, which is what
    # makes the Docker-based runner work.
    scratch_dir = job.fileStore.getLocalTempDir()

    bam_name = "sorted.bam"
    # Without an explicit temp prefix samtools tries to write its temp files
    # back to the FTP location of the input.
    part_prefix = bam_name + ".part"

    RealtimeLogger.info("Sort {} to BAM".format(sam_url))

    sort_command = ["samtools", "sort", "-n",
                    "-o", bam_name,
                    "-T", part_prefix,
                    sam_url]
    options.drunner.call(job, [sort_command], work_dir=scratch_dir)

    # Save to file store
    sorted_bam_path = os.path.join(scratch_dir, bam_name)
    bam_id = job.fileStore.writeGlobalFile(sorted_bam_path)

    # Convert and return FASTQs
    convert = job.addChildJobFn(convert_job, options, sam_url, bam_id,
                                cores=4, memory="16G", disk="500G")
    return convert.rv()
def download(filename):
    """
    Copy one file from in_store to out_store through a local temp file.

    Uses `options`, `in_store`, `out_store` and `job` from the enclosing
    scope. An existing output file is skipped unless options.overwrite is
    set, or options.check_size is set and the two stores report different
    sizes.

    Raises a plain Exception whose message is the full formatted traceback
    of whatever went wrong.
    """
    try:
        if (not options.overwrite) and out_store.exists(filename):
            # File exists. But make sure its size is correct.

            if not options.check_size:
                # Skip existing file. No need to check the length.
                RealtimeLogger.info("Skipped {}".format(filename))
                return

            out_size = out_store.get_size(filename)
            in_size = in_store.get_size(filename)
            if out_size != in_size:
                # Complain about size mismatch and copy
                RealtimeLogger.warning(
                    "Redownloading {}! Size was {} and not {}!".format(
                        filename, out_size, in_size))
            else:
                # Skip existing file
                RealtimeLogger.info("Skipped {}".format(filename))
                return

        # Make a temp file
        (handle, path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
        os.close(handle)

        try:
            RealtimeLogger.debug("Download {}".format(filename))

            # Download
            in_store.read_input_file(filename, path)
            # Store
            out_store.write_output_file(path, filename)
        finally:
            # Clean up on failure too, so a failed transfer does not leave
            # the scratch file behind.
            if os.path.exists(path):
                os.unlink(path)

    except:
        # Deliberately catches everything: put all exception text into an
        # exception and raise that, so the caller always sees a plain
        # Exception carrying the traceback text.
        raise Exception("".join(
            traceback.format_exception(*sys.exc_info())))

    RealtimeLogger.info("Copied {}".format(filename))
def run(self, fileStore):
    """
    Emit one message at info level and one at debug level through
    RealtimeLogger. `fileStore` is not used.
    """
    info_message = 'This should be logged at info level'
    debug_message = 'This should be logged at debug level'
    RealtimeLogger.info(info_message)
    RealtimeLogger.debug(debug_message)