Example #1
def run_split_reads_if_needed(job, context, fastq, gam_input_reads,
                              bam_input_reads, reads_file_ids):
    """
    Return a list of lists of read chunk file IDs, one list per read file.

    If the workflow is in single_reads_chunk mode (according to
    context.config.single_reads_chunk), produce one chunk per file.
    
    Otherwise, produce several chunks per file.
    """

    if not context.config.single_reads_chunk:
        reads_chunk_ids = job.addChildJobFn(
            run_split_reads,
            context,
            fastq,
            gam_input_reads,
            bam_input_reads,
            reads_file_ids,
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info(
            "Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in reads_file_ids]

    return reads_chunk_ids
Example #2
def run_split_gam_reads(job, context, gam_input_reads, gam_reads_file_id):
    """ split up an input reads file in GAM format
    """
    RealtimeLogger.info("Starting gam split")
    start_time = timeit.default_timer()
    
    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Download the input GAM reads file
    gam_path = os.path.join(work_dir, os.path.basename(gam_input_reads))
    job.fileStore.readGlobalFile(gam_reads_file_id, gam_path)

    # Split up the gam into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    cmd = ['vg', 'chunk', '-a', os.path.basename(gam_path), '--gam-split-size', str(chunk_size),
           '--prefix', 'gam_reads_chunk']

    context.runner.call(job, cmd, work_dir = work_dir)

    gam_chunk_ids = []
    for chunk_name in os.listdir(work_dir):
        if chunk_name.endswith('.gam') and chunk_name.startswith('gam_reads_chunk'):
            gam_chunk_ids.append(context.write_intermediate_file(job, os.path.join(work_dir, chunk_name)))
        
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info("Split gam into {} chunks. Process took {} seconds.".format(len(gam_chunk_ids), run_time))

    return gam_chunk_ids
Example #3
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved,
                   xg_file_id, paths):
    """ split the fastq, then surject each chunk.  returns outputgams, paired with total surject time
    (excluding toil-vg overhead such as transferring and splitting files )"""

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    if not context.config.single_reads_chunk:
        reads_chunk_ids = child_job.addChildJobFn(
            run_split_reads,
            context,
            None,
            'aln.gam',
            None, [gam_input_reads_id],
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info(
            "Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in [gam_input_reads_id]]

    return child_job.addFollowOnJobFn(run_whole_surject,
                                      context,
                                      reads_chunk_ids,
                                      output_name,
                                      interleaved,
                                      xg_file_id,
                                      paths,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
Example #4
def minigraph_map_all(job, config, gfa_id, fa_id_map):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    # do the mapping
    gaf_ids = []
    for event, fa_id in fa_id_map.items():
        RealtimeLogger.info("adding child event={} faid={} gfaid={}".format(
            event, fa_id, gfa_id))
        minigraph_map_job = top_job.addChildJobFn(minigraph_map_one,
                                                  config,
                                                  event,
                                                  fa_id,
                                                  gfa_id,
                                                  cores=1,
                                                  disk=5 *
                                                  (fa_id.size + gfa_id.size))
        gaf_ids.append(minigraph_map_job.rv())

    # convert to paf
    paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_ids)

    return paf_job.rv()
Example #5
def get_sfam_ddi_sizes(job, sfam_id, observed=True):
    int_type = "observed" if observed else "inferred"
    work_dir = job.fileStore.getLocalTempDir()
    interface_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    interfaces_key = "{s}/{s}.{o}_interactome".format(s=sfam_id, o=int_type)
    interfaces_file = os.path.basename(interfaces_key)
    interface_store.read_input_file(interfaces_key, interfaces_file)

    interfaces = pd.read_hdf(interfaces_file, "table")

    RealtimeLogger.info("COLS: {}".format(interfaces.columns))
    counts = interfaces.fillna(-1.).groupby(
        ["mol_superfam_id",
         "int_superfam_id"]).size().reset_index(name="count")

    RealtimeLogger.info("SIZES :{}".format(counts))

    try:
        os.remove(interfaces_file)
    except OSError:
        pass

    return counts
Example #6
def poll(job, options, file_id, number, cores=0.1, disk='200M', memory='512M'):

    # Wait a random amount of time before grabbing the file for others to cache it
    time.sleep(random.randint(options.minSleep, options.minSleep + 10))

    # Read the file. Don't accept a symlink because then we might just have the
    # filestore's copy, even if caching is not happening.
    local_file = job.fileStore.readGlobalFile(file_id,
                                              cache=True,
                                              mutable=False,
                                              symlink=False)

    # Wait a random amount of time after grabbing the file, for others to use it
    time.sleep(random.randint(options.minSleep, options.minSleep + 10))

    # Stat the file (reads through links)
    stats = os.stat(local_file)

    # Check what machine we are
    hostname = socket.gethostname()

    RealtimeLogger.info(
        'Job {} on host {} sees file at device {} inode {}'.format(
            number, hostname, stats.st_dev, stats.st_ino))

    # Return a tuple representing our view of the file.
    # Drop hostname since hostnames are unique per pod.
    return (stats.st_dev, stats.st_ino)
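A minimal sketch (assuming the poll function above is importable, and that options carries minSleep) of a parent job that fans out several poll children over the same file ID, so a follow-on can compare their (device, inode) views and tell whether the cache shared a single copy:

def poll_many(job, options, file_id, count):
    # Fan out `count` poll jobs against one file ID and return a list of
    # promises for their (st_dev, st_ino) tuples; identical tuples from jobs
    # scheduled on the same host suggest they shared one cached copy.
    return [job.addChildJobFn(poll, options, file_id, number).rv()
            for number in range(count)]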
Example #7
def best_sfams(job, all_counts, max_sfams=300):
    import json
    work_dir = job.fileStore.getLocalTempDir()
    out_store = IOStore.get("aws:us-east-1:molmimic-ddi")

    #Merge into one dataframe
    counts = pd.concat(all_counts)

    #mol->int should be same as int->mol: remove dupes
    ddi_counts = {}
    for count_df in all_counts:
        for row in count_df.itertuples():
            ddi = tuple(
                map(int, sorted((row.mol_superfam_id, row.int_superfam_id))))
            if ddi in ddi_counts:
                RealtimeLogger.info("{} {}, are counts symmetrical? {}".format(
                    ddi[0], ddi[1],
                    "Yes" if ddi_counts[ddi] == row.count else "No"))
                continue
            ddi_counts[ddi] = row.count

    sfams = sorted(ddi_counts.iteritems(), key=lambda x: x[1], reverse=True)
    RealtimeLogger.info("sfams is {}".format(sfams))
    sfam_file = os.path.join(work_dir, "sorted_sfams.json")
    with open(sfam_file, "w") as f:
        json.dump(sfams, f)
    out_store.write_output_file(sfam_file, "sorted_sfams.json")

    return sfams[:max_sfams]
Example #8
def process_sfam(job, sfam_id, pdbFileStoreID, cores=1):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-full-structures".format(prefix))

    sdoms_file = copy_pdb_h5(job, pdbFileStoreID)

    sdoms = pd.read_hdf(unicode(sdoms_file),
                        "merged")  #, where="sfam_id == {}".format(sfam_id))
    # skip_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "keep.csv")
    # if os.path.isfile(skip_file):
    #     skip = pd.read_csv(skip_file)
    #     sdoms = sdoms[sdoms["sdi"].isin(skip["sdi"])]

    sdoms = sdoms[sdoms["sfam_id"] == float(
        sfam_id)]["sdi"].drop_duplicates().dropna()
    #sdoms = sdoms[:1]

    if cores > 2:
        #Only makes sense for SLURM or other bare-metal clusters
        setup_dask(cores)
        d_sdoms = dd.from_pandas(sdoms, npartitions=cores)
        RealtimeLogger.info("Running sfam dask {}".format(sdoms))
        processed_domains = d_sdoms.apply(
            lambda row: process_domain(job, row.sdi, sdoms_file),
            axis=1).compute()
    else:
        processed_domains = job.addChildJobFn(map_job_rv,
                                              process_domain,
                                              sdoms,
                                              pdbFileStoreID,
                                              preemptable=True).rv()

    return processed_domains
Example #9
def run_split_fastq(job, context, fastq, fastq_i, sample_fastq_id):

    RealtimeLogger.info("Starting fastq split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # We need the sample fastq for alignment
    fastq_name = os.path.basename(fastq[fastq_i])
    fastq_path = os.path.join(work_dir, fastq_name)
    fastq_gzipped = os.path.splitext(fastq_name)[1] == '.gz'
    fastq_name = os.path.splitext(fastq_name)[0]
    if fastq_gzipped:
        fastq_name = os.path.splitext(fastq_name)[0]
    job.fileStore.readGlobalFile(sample_fastq_id, fastq_path)

    # Split up the fastq into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    # 4 lines per read
    chunk_lines = chunk_size * 4

    # Note we do this on the command line because Python is too slow
    if fastq_gzipped:
        cmd = [['gzip', '-d', '-c', os.path.basename(fastq_path)]]
    else:
        cmd = [['cat', os.path.basename(fastq_path)]]

    cmd.append([
        'split', '-l',
        str(chunk_lines), '--filter=pigz -p {} > $FILE.fq.gz'.format(
            max(1,
                int(context.config.fq_split_cores) - 1)), '-',
        '{}-chunk.'.format(fastq_name)
    ])

    context.runner.call(job, cmd, work_dir=work_dir, tool_name='pigz')

    fastq_chunk_ids = []
    for chunk_name in sorted(os.listdir(work_dir)):
        if chunk_name.endswith('.fq.gz') and chunk_name.startswith(
                '{}-chunk'.format(fastq_name)):
            fastq_chunk_ids.append(
                context.write_intermediate_file(
                    job, os.path.join(work_dir, chunk_name)))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info(
        "Split fastq into {} chunks. Process took {} seconds.".format(
            len(fastq_chunk_ids), run_time))

    return fastq_chunk_ids
Example #10
    def read_input_file(self, input_path, local_path):
        """
        Get input from the filesystem.
        """

        RealtimeLogger.debug("Loading {} from FileIOStore in {} to {}".format(
            input_path, self.path_prefix, local_path))

        if os.path.exists(local_path):
            # Try deleting the existing item if it already exists
            try:
                os.unlink(local_path)
            except:
                # Don't fail here, fail complaining about the assertion, which
                # will be more informative.
                pass

        # Make sure the path is clear for copying
        assert (not os.path.exists(local_path))

        # Where is the file actually?
        real_path = os.path.abspath(os.path.join(self.path_prefix, input_path))

        if not os.path.exists(real_path):
            RealtimeLogger.error(
                "Can't find {} from FileIOStore in {}!".format(
                    input_path, self.path_prefix))
            raise RuntimeError("File {} missing!".format(real_path))

        # Make a temporary file
        temp_handle, temp_path = tempfile.mkstemp(
            dir=os.path.dirname(local_path))
        os.close(temp_handle)

        # Copy to the temp file
        shutil.copy2(real_path, temp_path)

        # Rename the temp file to the right place, atomically
        RealtimeLogger.info("rename {} -> {}".format(temp_path, local_path))
        os.rename(temp_path, local_path)

        # Look at the file stats
        file_stats = os.stat(real_path)

        if (file_stats.st_uid == os.getuid()
                and file_stats.st_mode & stat.S_IWUSR):
            # We own this file and can write to it. We don't want the user
            # script messing it up through the symlink.

            try:
                # Clear the user write bit, so the user can't accidentally
                # clobber the file in the actual store through the symlink.
                os.chmod(real_path, file_stats.st_mode ^ stat.S_IWUSR)
            except OSError:
                # If something goes wrong here (like us not having permission to
                # change permissions), ignore it.
                pass
Example #11
def ensure_disk(job,
                job_fn,
                job_fn_args,
                job_fn_kwargs,
                file_id_list,
                factor=8,
                padding=1024**3):
    """
    Ensure that the currently running job has enough disk to load all the given
    file IDs (passed as any number of lists of file IDs), and process them,
    producing factor times as much data, plus padding.
    
    Takes the job, the function that is the job, the list of arguments passed
    in (except the job object), the dict of keyword args passed in, and then
    a file ID list or iterable.
    
    If there is not enough disk, re-queues the job with more disk, and returns
    the promise for the result.
    
    If there is enough disk, returns None
    
    TODO: Convert to promised requirements if it is sufficiently expressive.
    """

    # We need to compute the total size of our inputs, expected intermediates,
    # and outputs, and re-queue ourselves if we don't have enough disk.
    required_disk = 0

    for file_id in file_id_list:
        # For each file in the collection
        # Say we need space for it
        required_disk += file_id.size

    # Multiply out for intermediates and results
    # TODO: Allow different factors for different file IDs
    # We only need to multiply e.g. BAM files, not indexes
    required_disk *= factor

    # Add some padding
    required_disk += padding

    if job.disk < required_disk:
        # Re-queue with more disk
        RealtimeLogger.info(
            "Re-queueing job with {} bytes of disk; originally had {}".format(
                required_disk, job.disk))
        requeued = job.addChildJobFn(job_fn,
                                     *job_fn_args,
                                     cores=job.cores,
                                     memory=job.memory,
                                     disk=required_disk,
                                     **job_fn_kwargs)
        return requeued.rv()
    else:
        # Disk we have is OK
        return None
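A minimal sketch of the calling pattern the docstring describes (Example #20 below shows the same pattern for real in run_concat_files); the process_files job function and its arguments here are hypothetical:

def process_files(job, context, file_ids):
    # Re-queue this job with a larger disk request if the current allocation
    # cannot hold the inputs plus the expected intermediates.
    requeue_promise = ensure_disk(job, process_files, [context, file_ids], {},
                                  file_ids)
    if requeue_promise is not None:
        # A copy of this job was scheduled with enough disk; defer to it.
        return requeue_promise
    # Otherwise there is enough disk, so do the real work here.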
Example #12
def gcsa_index_job(job, options, vg_ids, primary_path_names=None):
    """
    Index the given graphs into a GCSA/LCP index, and return a pair of file IDs
    for the GCSA and the LCP files.
    
    Will prune the graph before indexing unless options.prune_opts is explicitly
    set as an empty list.
    
    """

    # Do any options manipulation we need to do

    # Strip out stuff we don't want and apply config defaults
    options = sanitize_options(options)

    # Add the outstore, which we have sort of disabled. It insists on writing
    # stuff, so just drop it in the current directory. It doesn't read it back.
    options.out_store = "file:."

    # Don't use outstore instead of the file store
    options.force_outstore = False

    # Pretend we're the pipeline tool
    options.tool = "pipeline"

    # Add stuff that toil vg index uses

    # options.graphs has to have a name for every graph, to save it under in the
    # local temp dir.
    options.graphs = ["graph{}".format(i) for i in xrange(len(vg_ids))]

    # We also need a "chroms" giving the primary path for each graph. It's OK if
    # the path doesn't exist in a given graph, but if it does it will be added
    # to the index.

    # We have primary path names to use. We can just try to retain all the
    # paths in all graphs.
    RealtimeLogger.info("Want to GCSA-index {} with paths {}".format(
        vg_ids, primary_path_names))

    # Fake path names
    options.chroms = ["fake{}".format(i) for i in xrange(len(vg_ids))]

    # options.index_name has to have the basename for the .gcsa in the local
    # temp dir.
    options.index_name = "gcsaindex"

    return job.addChildJobFn(toil_vg.vg_index.run_gcsa_prep,
                             options,
                             vg_ids,
                             primary_path_override=primary_path_names,
                             cores=options.misc_cores,
                             memory=options.misc_mem,
                             disk=options.misc_disk).rv()
Example #13
    def call_with_singularity(self, job, args, work_dir, outfile, errfile, check_output, tool_name, mount_list): 
        """ Thin wrapper for singularity_call that will use internal lookup to
        figure out the location of the singularity file.  Only exposes singularity_call
        parameters used so far.  expect args as list of lists.  if (toplevel)
        list has size > 1, then piping interface used """

        RealtimeLogger.info(truncate_msg("Singularity Run: {}".format(" | ".join(" ".join(x) for x in args))))
        start_time = timeit.default_timer()

        # we use the first argument to look up the tool in the singularity map
        # but allow overriding of this with the tool_name parameter
        name = tool_name if tool_name is not None else args[0][0]
        tool = self.docker_tool_map[name]
        parameters = args[0] if len(args) == 1 else args
        
        # Get a lock on the environment
        global environment_lock
        with environment_lock:
            # TODO: We can't stop other threads using os.environ or subprocess or w/e on their own

            # Set the locale to C for consistent sorting, and activate vg traceback     
            update_env = {'LC_ALL' : 'C', 'VG_FULL_TRACEBACK': '1'}
            if name == 'Rscript':
                # The R dockers by default want to install packages in non-writable directories. Sometimes.
                # Make sure a writable directory which exists is used.
                update_env['R_LIBS']='/tmp'
            old_env = {}
            for env_name, env_val in list(update_env.items()):
                old_env[env_name] = os.environ.get(env_name)
                os.environ[env_name] = env_val
            
            if check_output is True:
                ret = singularityCheckOutput(job, tool, parameters=parameters, workDir=work_dir, mount_list=mount_list)
            else:
                ret = singularityCall(job, tool, parameters=parameters, workDir=work_dir, outfile = outfile, mount_list=mount_list)
            
            # Restore old locale and vg traceback
            for env_name, env_val in list(update_env.items()):
                if old_env[env_name] is not None:
                    os.environ[env_name] = old_env[env_name]
                else:
                    del os.environ[env_name]
        
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        RealtimeLogger.info("Successfully singularity ran {} in {} seconds.".format(
            " | ".join(" ".join(x) for x in args), run_time))

        if outfile:
            outfile.flush()
            os.fsync(outfile.fileno())
        
        return ret
Example #14
def setup(job, inputFile, N, downCheckpoints, options):
    """
    Sets up the sort.
    Returns the FileID of the sorted file
    """
    RealtimeLogger.info("Starting the merge sort")
    return job.addChildJobFn(down,
                             inputFile, N, 'root',
                             downCheckpoints,
                             options = options,
                             preemptable=True,
                             memory=sortMemory).rv()
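A minimal sketch of launching the sort from a driver script, assuming setup, down, and up above live in a hypothetical module named sort_example and that down/up read sortMemory and mergeMemory off the options namespace:

import os
from toil.common import Toil
from toil.job import Job

from sort_example import setup  # hypothetical module holding the jobs above

if __name__ == "__main__":
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    options.sortMemory = "1G"     # assumed attributes consumed by down/up
    options.mergeMemory = "1G"

    with Toil(options) as workflow:
        input_id = workflow.importFile("file://" + os.path.abspath("input.txt"))
        sorted_id = workflow.start(Job.wrapJobFn(setup, input_id, 10000,
                                                 False, options))
        workflow.exportFile(sorted_id, "file://" + os.path.abspath("sorted.txt"))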
Example #15
    def list_input_directory(self,
                             input_path,
                             recursive=False,
                             with_times=False):
        """
        Loop over directories on the filesystem.
        """

        RealtimeLogger.info("Enumerating {} from "
                            "FileIOStore in {}".format(input_path,
                                                       self.path_prefix))

        if not os.path.exists(os.path.join(self.path_prefix, input_path)):
            # Nothing to list over
            return

        if not os.path.isdir(os.path.join(self.path_prefix, input_path)):
            # Can't list a file, only a directory.
            return

        for item in os.listdir(os.path.join(self.path_prefix, input_path)):
            if (recursive and os.path.isdir(
                    os.path.join(self.path_prefix, input_path, item))):
                # We're recursing and this is a directory.
                # Recurse on this.
                for subitem in self.list_input_directory(
                        os.path.join(input_path, item), recursive):

                    # Make relative paths include this directory name and yield
                    # them
                    name_to_yield = os.path.join(item, subitem)

                    if with_times:
                        # What is the mtime in seconds since epoch?
                        mtime_epoch_seconds = os.path.getmtime(
                            os.path.join(input_path, item, subitem))
                        # Convert it to datetime

                        yield name_to_yield, mtime_epoch_seconds
                    else:
                        yield name_to_yield
            else:
                # This isn't a directory or we aren't being recursive
                # Just report this individual item.

                if with_times:
                    # What is the mtime in seconds since epoch?
                    mtime_epoch_seconds = os.path.getmtime(
                        os.path.join(input_path, item))

                    yield item, mtime_epoch_seconds
                else:
                    yield item
Example #16
def convert_job(job, options, sam_url, bam_id):
    """
    Subset and convert BAM to FASTQ pair. Returns FASTQ IDs.
    """

    # We have to deal with relative paths relative to here if we want Docker to
    # work right
    work_dir = job.fileStore.getLocalTempDir()

    # Read the BAM back, into the work_dir
    sorted_bam = "sorted.bam"
    job.fileStore.readGlobalFile(bam_id, os.path.join(work_dir, sorted_bam))

    RealtimeLogger.info("Subset {} to SAM".format(sam_url))

    # Then stream to SAM and select just the reads we want. This file is used by
    # this Python code as a soy-based FIFO substitute, and so doesn't need to be
    # in work_dir.
    subset_sam = job.fileStore.getLocalTempDir() + "/subset.sam"

    # We start out with just a view pipeline
    sam_command = [["samtools", "view", sorted_bam]]

    if options.contig is not None:
        # Subset to this contig and related alts with awk
        sam_command.append([
            "awk",
            ("{if ($3 ~ /" + options.contig + "(_.*)?$/ || $7 ~ /" +
             options.contig + "(_.*)?$/) print}")
        ])

    options.drunner.call(job,
                         sam_command,
                         outfile=open(subset_sam, "w"),
                         work_dir=work_dir)

    RealtimeLogger.info("Convert {} to FASTQ".format(sam_url))

    with job.fileStore.writeGlobalFileStream() as (fq1_handle, fq1_id):
        with job.fileStore.writeGlobalFileStream() as (fq2_handle, fq2_id):

            # Then prep options for running the converter script in this Python
            convert_options = argparse.Namespace()
            convert_options.input_sam = open(subset_sam, "r")
            convert_options.fq1 = fq1_handle
            convert_options.fq2 = fq2_handle
            convert_options.drop_secondary = True
            convert_options.expect_paired = True
            convert_options.interleaved = False

            smartSam2Fastq.run(convert_options)

            return fq1_id, fq2_id
Example #17
def run_split_bam_reads(job, context, bam_input_reads, bam_reads_file_id):
    """ split up an input reads file in BAM format
    """
    RealtimeLogger.info("Starting bam split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Download the input BAM reads file
    bam_path = os.path.join(work_dir, os.path.basename(bam_input_reads))
    job.fileStore.readGlobalFile(bam_reads_file_id, bam_path)

    # Split up the bam into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    # 1 line per read
    chunk_lines = chunk_size * 1

    cmd = [['samtools', 'view', os.path.basename(bam_path)]]
    cmd.append([
        'split', '-l',
        str(chunk_lines),
        '--filter=bash -c \'cat <(samtools view -H {}) <(cat -)'.format(
            os.path.basename(bam_path)) +
        ' | samtools view -O BAM --threads {} -'.format(
            max(1,
                int(context.config.fq_split_cores) - 1)) + ' > $FILE.bam\'',
        '-', 'bam_reads_chunk.'
    ])

    context.runner.call(job, cmd, work_dir=work_dir)

    bam_chunk_ids = []
    for chunk_name in sorted(os.listdir(work_dir)):
        if chunk_name.endswith('.bam') and chunk_name.startswith(
                'bam_reads_chunk'):
            bam_chunk_ids.append(
                context.write_intermediate_file(
                    job, os.path.join(work_dir, chunk_name)))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info(
        "Split bam into {} chunks. Process took {} seconds.".format(
            len(bam_chunk_ids), run_time))

    return bam_chunk_ids
Example #18
def start_toil(job):
    print "Starting job"
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    int_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    #Download PDB info
    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    #Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_file)

    #Download PDB Taxonomy information
    tax_file = os.path.join(work_dir, "pdb_chain_taxonomy.h5")
    in_store.read_input_file("pdb_chain_taxonomy.h5", tax_file)

    #Add tax info into local job store
    taxFileStoreID = job.fileStore.writeGlobalFile(tax_file)

    tables = set(range(1,87))-set([51])

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()
    #RealtimeLogger.info("SFAMS: {}".format(sfams.shape[0]))
    sfamFileStoreIDs = {}
    for s in sfams:
        k = "{0}/{0}.observed_interactome".format(int(s))
        if int_store.exists(k):
            RealtimeLogger.info("Loading {}".format(s))
            f = job.fileStore.getLocalTempFileName()
            int_store.read_input_file(k, f)
            sfamFileStoreIDs[int(s)] = job.fileStore.writeGlobalFile(f)
            os.remove(f)
        else:
            RealtimeLogger.info("FAILED Loading {}".format(s))

    assert len(sfamFileStoreIDs) > 0

    os.remove(tax_file)
    os.remove(pdb_file)

    job.log("Running tables: {}".format(tables))
    j = job
    for table in tables:
        j.addFollowOnJobFn(get_inferred_structural_interactome_by_table, table,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    # map_job(job, get_inferred_structural_interactome_by_table, tables,
    #     pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    j.addFollowOnJobFn(merge_inferred_interactome, pdbFileStoreID)
Example #19
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved,
                      xg_file_id, paths):
    """
    Surject all gam chunks in parallel.
    
    Surject all the GAM file IDs in reads_chunk_ids, saving the merged BAM as output_name.
    
    If interleaved is true, expects paired-interleaved GAM input and writes paired BAM output.
    
    Surjects against the given collection of paths in the given XG file.
    
    """

    RealtimeLogger.info(
        "Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # this will be a list of lists.
    # bam_chunk_file_ids[i][j], will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    bam_chunk_file_ids = []
    bam_chunk_running_times = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph surject on each gam chunk
        chunk_surject_job = child_job.addChildJobFn(
            run_chunk_surject,
            context,
            interleaved,
            xg_file_id,
            paths,
            chunk_filename_ids,
            '{}_chunk{}'.format(output_name, chunk_id),
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(chunk_surject_job.rv(0))
        bam_chunk_running_times.append(chunk_surject_job.rv(1))

    return child_job.addFollowOnJobFn(run_merge_bams,
                                      context,
                                      output_name,
                                      bam_chunk_file_ids,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
Example #20
def run_concat_files(job, context, file_ids, dest_name=None, header=None):
    """
    Utility job to concatenate some files. Returns the concatenated file ID.
    If given a dest_name, writes the result to the out store with the given name.
    (We wanted to use name, but that kwarg is reserved by Toil.)
    If given a header, prepends the header to the file with a trailing newline.
    """

    requeue_promise = ensure_disk(job,
                                  run_concat_files, [context, file_ids], {
                                      "dest_name": dest_name,
                                      "header": header
                                  },
                                  file_ids,
                                  factor=2)
    if requeue_promise is not None:
        # We requeued ourselves with more disk to accommodate our inputs
        return requeue_promise

    work_dir = job.fileStore.getLocalTempDir()

    out_name = os.path.join(work_dir,
                            'output.dat' if dest_name is None else dest_name)

    # Concatenate all the files
    # TODO: We don't use the trick where we append to the first file to save a copy. Should we?
    with open(out_name, 'w') as out_file:
        if header is not None:
            # Put the header if specified
            out_file.write(header + '\n')
        for file_id in file_ids:
            with job.fileStore.readGlobalFileStream(file_id) as in_file:
                # Then beam over each file
                shutil.copyfileobj(in_file, out_file)

    if dest_name is None:
        # Send back an intermediate file
        RealtimeLogger.info(
            "Concatenated {} files into intermediate file {}".format(
                len(file_ids), out_name))
        return context.write_intermediate_file(job, out_name)
    else:
        # Write to outstore under the given name.
        RealtimeLogger.info(
            "Concatenated {} files into output file {} -> {}".format(
                len(file_ids), out_name, dest_name))
        return context.write_output_file(job, out_name, dest_name)
Example #21
    def calculate_features_for_atom(self, atom, only_aa=False, only_atom=False,
      non_geom_features=False, use_deepsite_features=False, warn_if_buried=False):
        if use_deepsite_features:
            features = self.get_deepsite_features(atom)
            if warn_if_buried:
                is_buried = self.get_accessible_surface_area(atom)[-2]
        elif only_atom:
            features = self.get_element_type(atom)
            if warn_if_buried:
                is_buried = self.get_accessible_surface_area(atom)[-2]
        elif only_aa:
            features = self.get_residue(atom)
            if warn_if_buried:
                is_buried = self.get_accessible_surface_area(atom)[-2]
        elif non_geom_features:
            features = np.zeros(13)
            features[0:5] = self.get_element_type(atom)
            features[5:9] = self.get_charge_and_electrostatics(atom)
            features[9:13] = self.get_hydrophobicity(atom)
            is_buried = self.get_accessible_surface_area(atom)[-2]
        else:
            features = np.empty(self.n_atom_features)

            features[0:13]  = self.get_atom_type(atom)
            features[13:18] = self.get_element_type(atom)
            features[18:19] = self.get_vdw(atom)
            features[19:26] = self.get_charge_and_electrostatics(atom)
            features[26:30] = self.get_concavity(atom)
            features[30:34] = self.get_hydrophobicity(atom)
            features[34:40] = self.get_accessible_surface_area(atom)
            features[40:61] = self.get_residue(atom)
            features[61:64] = self.get_ss(atom)
            features[64:70] = self.get_deepsite_features(atom, calc_charge=False,
                calc_conservation=False)
            features[70:73] = self.get_evolutionary_conservation_score(atom)

            is_buried = bool(features[35])

        RealtimeLogger.info("Finished atom {}".format(atom))

        self.atom_features[atom.serial_number-1] = features

        if warn_if_buried:
            return features, is_buried
        else:
            return features
Example #22
def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory):
    """
    Input is a file, a subdivision size N, and a path in the hierarchy of jobs.
    If the range is larger than a threshold N the range is divided recursively and
    a follow on job is then created which merges back the results else
    the file is sorted and placed in the output.
    """
    
    RealtimeLogger.info("Down job starting: %s" % path)
    
    # Read the file
    inputFile = job.fileStore.readGlobalFile(inputFileStoreID, cache=False)
    length = os.path.getsize(inputFile)
    if length > N:
        # We will subdivide the file
        RealtimeLogger.critical("Splitting file: %s of size: %s"
                % (inputFileStoreID, length))
        # Split the file into two copies
        midPoint = getMidPoint(inputFile, 0, length)
        t1 = job.fileStore.getLocalTempFile()
        with open(t1, 'w') as fH:
            fH.write(copySubRangeOfFile(inputFile, 0, midPoint+1))
        t2 = job.fileStore.getLocalTempFile()
        with open(t2, 'w') as fH:
            fH.write(copySubRangeOfFile(inputFile, midPoint+1, length))
        # Call down recursively. By giving the rv() of the two jobs as inputs to the follow-on job, up,
        # we communicate the dependency without hindering concurrency.
        result = job.addFollowOnJobFn(up,
                                    job.addChildJobFn(down, job.fileStore.writeGlobalFile(t1), N, path + '/0',
                                                      downCheckpoints, checkpoint=downCheckpoints, options=options,
                                                      preemptable=True, memory=options.sortMemory).rv(),
                                    job.addChildJobFn(down, job.fileStore.writeGlobalFile(t2), N, path + '/1',
                                                      downCheckpoints, checkpoint=downCheckpoints, options=options,
                                                      preemptable=True, memory=options.mergeMemory).rv(),
                                    path + '/up', preemptable=True, options=options, memory=options.sortMemory).rv()
    else:
        # We can sort this bit of the file
        RealtimeLogger.critical("Sorting file: %s of size: %s"
                % (inputFileStoreID, length))
        # Sort the copy and write back to the fileStore
        shutil.copyfile(inputFile, inputFile + '.sort')
        sort(inputFile + '.sort')
        result = job.fileStore.writeGlobalFile(inputFile + '.sort')
        
    RealtimeLogger.info("Down job finished: %s" % path)
    return result
Example #23
def create_data_loader(job, sfam_id, preemptable=True):
    """Create H5 for Molmimic3dCNN to read

    Note: move this somewhere else
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    id_format = re.compile(
        "^([A-Z0-9]{4})_([A-Za-z0-9]+)_sdi([0-9]+)_d([0-9]+)$")

    #Get all with keys same sfam, but do not download

    in_store = IOStore.get("{}:molmimic-clustered-structures".format(prefix))
    keys = [id_format.match(f).groups() for f in in_store.list_input_directory(sfam_id) \
        if f.endswith(".pdb") and id_format.match(f)]

    pdb_path = os.path.join(PDB_PATH, dataset_name, "by_superfamily",
                            str(int(sfam_id)))
    clusters_file = os.path.join(pdb_path, "{}_nr.fasta".format(int(sfam_id)))

    try:
        pdb, chain, sdi, domain = zip(*[id_format.match(s.id[:-2]).groups() \
            for s in SeqIO.parse(clusters_file, "fasta")])
    except ValueError:
        RealtimeLogger.info(
            "Unable to create data loading file for {}.".format(sfam_id))
        return

    domains = pd.DataFrame({
        "pdb": pdb,
        "chain": chain,
        "domNo": domain,
        "sdi": sdi
    })

    data_loader = os.path.join(pdb_path, "{}.h5".format(int(sfam_id)))
    domains.to_hdf(unicode(data_loader), "table", complevel=9, complib="bzip2")
Example #24
def get_original_complexes(job,
                           mol_sfam,
                           int_sfam,
                           group,
                           int_type,
                           work_dir=None):
    if work_dir is None:
        work_dir = os.getcwd()

    complex_files = []
    for _, row in group:
        row = row.iloc[0]
        RealtimeLogger.info("ROW: {}".format(row))

        mol_file, mol_resi, int_file, int_resi = process_interface(
            job, row, int_type, work_dir=work_dir)
        # try:
        #     mol_file = download_pdb(job, row.mol_superfam_id, row.mol_pdb,
        #         row.mol_chain, row.mol_sdi_id, row.mol_domNo, work_dir=work_dir)
        #
        #     int_file = download_pdb(job, row.int_superfam_id, row.int_pdb,
        #         row.int_chain, row.int_sdi_id, row.int_domNo, work_dir=work_dir)
        # except (KeyboardInterrupt, SystemExit):
        #     raise
        # except Exception as e:
        if mol_file is None or int_file is None:
            #PDB files not found, skip
            RealtimeLogger.info(
                "Cannot download PDB {}.{}.{} bc it was not found".format(
                    row.mol_pdb, row.mol_chain, row.mol_sdi_id))
            complex_files.append(None)
            continue

        merged_file = next(
            prep((mol_file, "M"), (int_file, "I"),
                 merge=True,
                 work_dir=work_dir))

        complex_files.append(merged_file)

    return complex_files
Example #25
def up(job, inputFileID1, inputFileID2, options, memory=sortMemory):
    """
    Merges the two files and places them in the output.
    """
    with job.fileStore.writeGlobalFileStream() as (fileHandle,
                                                   outputFileStoreID):
        fileHandle = codecs.getwriter('utf-8')(fileHandle)
        with job.fileStore.readGlobalFileStream(
                inputFileID1) as inputFileHandle1:
            inputFileHandle1 = codecs.getreader('utf-8')(inputFileHandle1)
            with job.fileStore.readGlobalFileStream(
                    inputFileID2) as inputFileHandle2:
                inputFileHandle2 = codecs.getreader('utf-8')(inputFileHandle2)
                RealtimeLogger.info(
                    "Merging %s and %s to %s" %
                    (inputFileID1, inputFileID2, outputFileStoreID))
                merge(inputFileHandle1, inputFileHandle2, fileHandle)
        # Cleanup up the input files - these deletes will occur after the completion is successful.
        job.fileStore.deleteGlobalFile(inputFileID1)
        job.fileStore.deleteGlobalFile(inputFileID2)
        return outputFileStoreID
Example #26
def copy_everything(job, options):
    """
    Download the file list and copy all the files.
    
    """

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    batch_count = 0

    # List all the files.
    blobs_iterator = in_store.list_input_directory("", recursive=True)

    # Make an iterator that filters them
    filtered_iterator = (x for x in blobs_iterator
                         if fnmatch.fnmatchcase(x, options.pattern))

    # Batch them up
    for batch in group(filtered_iterator, options.batch_size):

        # For every batch, strip out any Nones that got put in when grouping
        batch = [x for x in batch if x is not None]

        # Copy everything in that batch
        job.addChildJobFn(copy_batch,
                          options,
                          batch,
                          cores=1,
                          memory="1G",
                          disk="10G")

        batch_count += 1

        if batch_count % 10 == 0:

            RealtimeLogger.info("Queued {} batches...".format(batch_count))

    RealtimeLogger.info("Queued {} total batches".format(batch_count))
Example #27
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    #Read in H5 for entire table
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfams = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
        columns=["nbr_superfam_id"]).drop_duplicates().dropna()
    skip_sfam = set([s for s in sfams["nbr_superfam_id"] if \
        out_store.exists("{0}/{0}.inferred_interactome".format(int(s))) or \
        not out_store.exists("{0}/{0}.observed_interactome".format(int(s)))])

    # skip_sfam = set([int(f.split("/", 1)[0]) for f in out_store.list_input_directory() \
    #    if f.endswith(".inferred_interactome")])

    sfams = sfams[~sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = sfams["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    # partial_sfams = set(int(k.split("/")[0]) for sfam in sfams for k in \
    #     out_store.list_input_directory(
    #         "{sfam}/_inftables/Intrac{table}_{sfam}.inferred_interactome".format( \
    #         sfam=sfam, table=table)) if not k.endswith("failed"))

    #sfams = list(set(sfams)-partial_sfams)

    if len(sfams) > 0:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    try:
        os.remove(tableInfPath)
    except OSError:
        pass
Example #28
def main_job(job, options, sam_urls):
    """
    Root job of the Toil workflow. Download the sample URLs.
    
    Returns a Directory containing a bunch of output files.
    
    """

    RealtimeLogger.info("Main job starting")
    RealtimeLogger.info("Temp directory location: {}".format(
        job.fileStore.getLocalTempDir()))

    # Make sure we can use samtools
    options.drunner.call(job, [["samtools", "--version"]])

    # We'll fill this with promises for subdirectories by sample filename
    subdir_promises = {}

    for sam_url in sam_urls:
        # Work out the base filename
        sam_filename = os.path.basename(urlparse.urlparse(sam_url).path)

        # Go download and convert the reads, and stick the FASTQs in a directory
        subdir_promises[sam_filename] = ToilPromise.wrap(
            job.addChildJobFn(extract_job,
                              options,
                              sam_url,
                              cores=1,
                              memory="16G",
                              disk="500G")).then(
                                  lambda (fq1, fq2): Directory({
                                      "fq1.fastq": fq1,
                                      "fq2.fastq": fq2
                                  }))

    # Mount each subdirectory under its original sam/bam/cram filename
    return ToilPromise.all(subdir_promises).then(
        lambda dirs: Directory().mount_all(dirs)).unwrap_result()
Example #29
def extract_job(job, options, sam_url):
    """
    Extract and fix up the given SAM/BAM/CRAM reads by URL.
    
    Return a pair of FASTQ file IDs.
    """

    # We have to deal with relative paths relative to here if we want Docker to
    # work right
    work_dir = job.fileStore.getLocalTempDir()

    # Let's just download the whole bam
    sorted_bam = "sorted.bam"

    # We need a prefix for temp files
    temp_prefix = sorted_bam + ".part"

    RealtimeLogger.info("Sort {} to BAM".format(sam_url))

    # Sort reads by name to a BAM file. If we don't give a temp file prefix it
    # tries to write the temp files back to the FTP.
    options.drunner.call(job, [[
        "samtools", "sort", "-n", "-o", sorted_bam, "-T", temp_prefix, sam_url
    ]],
                         work_dir=work_dir)

    # Save to file store
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, sorted_bam))

    # Convert and return FASTQs
    return job.addChildJobFn(convert_job,
                             options,
                             sam_url,
                             bam_id,
                             cores=4,
                             memory="16G",
                             disk="500G").rv()
Example #30
    def download(filename):
        """
        Download each file
        """

        try:

            if (not options.overwrite) and out_store.exists(filename):
                # File exists. But make sure its size is correct.

                if not options.check_size:
                    # Skip existing file. No need to check the length.
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

                out_size = out_store.get_size(filename)
                in_size = in_store.get_size(filename)
                if out_size != in_size:
                    # Complain about size mismatch and copy
                    RealtimeLogger.warning(
                        "Redownloading {}! Size was {} and not {}!".format(
                            filename, out_size, in_size))
                else:
                    # Skip existing file
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

            # Make a temp file
            (handle,
             path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
            os.close(handle)

            RealtimeLogger.debug("Download {}".format(filename))

            # Download
            in_store.read_input_file(filename, path)
            # Store
            out_store.write_output_file(path, filename)

            # Clean up
            os.unlink(path)

        except:
            # Put all exception text into an exception and raise that
            raise Exception("".join(
                traceback.format_exception(*sys.exc_info())))

        RealtimeLogger.info("Copied {}".format(filename))
Example #31
    def run(self, fileStore):
        RealtimeLogger.info('This should be logged at info level')
        RealtimeLogger.debug('This should be logged at debug level')
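A minimal sketch of running a job like the one above so both messages reach the leader, assuming Toil's --realTimeLogging option (surfaced as options.realTimeLogging) forwards worker-side RealtimeLogger records and that only records at or above the configured log level are printed:

from toil.common import Toil
from toil.job import Job
from toil.realtimeLogger import RealtimeLogger

class LoggingJob(Job):
    def run(self, fileStore):
        RealtimeLogger.info('This should be logged at info level')
        RealtimeLogger.debug('This should be logged at debug level')

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    options.realTimeLogging = True   # stream worker log messages to the leader
    options.logLevel = "DEBUG"       # let the debug line through as well
    with Toil(options) as workflow:
        workflow.start(LoggingJob())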