Example #1
def setup(job, inputFile, N, downCheckpoints, options):
    """
    Sets up the sort.
    Returns the FileID of the sorted file
    """
    RealtimeLogger.info("Starting the merge sort")
    return job.addChildJobFn(down,
                             inputFile,
                             N,
                             downCheckpoints,
                             options=options,
                             preemptable=True,
                             memory=sortMemory).rv()
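For context, a minimal launch sketch (not from the source) showing how a setup-style job like the one above could be started so that its RealtimeLogger output reaches the leader; the job store path, input file, and N value are placeholders, and sortMemory is assumed to be defined as in the snippet:

import os
from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./sort-jobstore")  # placeholder job store
    options.realTimeLogging = True  # forward RealtimeLogger messages to the leader

    with Toil(options) as workflow:
        # Import the file to sort, then start the workflow; setup() returns a
        # promise (.rv()) that resolves to the sorted file's FileID.
        inputFileID = workflow.importFile("file://" + os.path.abspath("input.txt"))
        sortedFileID = workflow.start(Job.wrapJobFn(setup, inputFileID, 1000,
                                                    False, options=options))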
Example #2
    def __connect(self):
        """
        Make sure we have an Azure connection, and set one up if we don't.
        """

        if self.connection is None:
            RealtimeLogger.debug("Connecting to account {}, using "
                                 "container {} and prefix {}".format(
                                     self.account_name, self.container_name,
                                     self.name_prefix))

            # Connect to the blob service where we keep everything
            self.connection = BlobService(account_name=self.account_name,
                                          account_key=self.account_key)
Example #3
    def read_input_file(self, input_path, local_path):
        """
        Get input from Azure.
        """

        self.__connect()

        RealtimeLogger.debug("Loading {} from AzureIOStore".format(input_path))

        # Download the blob. This is known to be synchronous, although it can
        # call a callback during the process.
        self.connection.get_blob_to_path(self.container_name,
                                         self.name_prefix + input_path,
                                         local_path)
Example #4
def start_toil(job):
    print("Starting job")
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    int_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    #Download PDB info
    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    #Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_file)

    #Download PDB Taxonomy information
    tax_file = os.path.join(work_dir, "pdb_chain_taxonomy.h5")
    in_store.read_input_file("pdb_chain_taxonomy.h5", tax_file)

    #Add tax info into local job store
    taxFileStoreID = job.fileStore.writeGlobalFile(tax_file)

    tables = set(range(1,87))-set([51])

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()
    #RealtimeLogger.info("SFAMS: {}".format(sfams.shape[0]))
    sfamFileStoreIDs = {}
    for s in sfams:
        k = "{0}/{0}.observed_interactome".format(int(s))
        if int_store.exists(k):
            RealtimeLogger.info("Loading {}".format(s))
            f = job.fileStore.getLocalTempFileName()
            int_store.read_input_file(k, f)
            sfamFileStoreIDs[int(s)] = job.fileStore.writeGlobalFile(f)
            os.remove(f)
        else:
            RealtimeLogger.info("FAILED Loading {}".format(s))

    assert len(sfamFileStoreIDs) > 0

    os.remove(tax_file)
    os.remove(pdb_file)

    job.log("Running tables: {}".format(tables))
    j = job
    for table in tables:
        j.addFollowOnJobFn(get_inferred_structural_interactome_by_table, table,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    # map_job(job, get_inferred_structural_interactome_by_table, tables,
    #     pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    j.addFollowOnJobFn(merge_inferred_interactome, pdbFileStoreID)
Example #5
    def _runDeferredFunction(self, deferredFunction):
        """
        Run a deferred function (either our own or someone else's).

        Reports an error if it fails.
        """

        try:
            deferredFunction.invoke()
        except Exception as err:
            # Report this in real time, if enabled. Otherwise the only place it ends up is the worker log.
            RealtimeLogger.error("Failed to run deferred function %s: %s", repr(deferredFunction), str(err))
        except:
            RealtimeLogger.error("Failed to run deferred function %s", repr(deferredFunction))
Example #6
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved,
                      xg_file_id, paths):
    """
    Surject all gam chunks in parallel.
    
    Surjects all the GAM file IDs in reads_chunk_ids, saving the merged BAM as output_name.
    
    If interleaved is true, expects paired-interleaved GAM input and writes paired BAM output.
    
    Surjects against the given collection of paths in the given XG file.
    
    """

    RealtimeLogger.info(
        "Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # this will be a list of lists.
    # bam_chunk_file_ids[i][j], will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    bam_chunk_file_ids = []
    bam_chunk_running_times = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph surject on each gam chunk
        chunk_surject_job = child_job.addChildJobFn(
            run_chunk_surject,
            context,
            interleaved,
            xg_file_id,
            paths,
            chunk_filename_ids,
            '{}_chunk{}'.format(output_name, chunk_id),
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(chunk_surject_job.rv(0))
        bam_chunk_running_times.append(chunk_surject_job.rv(1))

    return child_job.addFollowOnJobFn(run_merge_bams,
                                      context,
                                      output_name,
                                      bam_chunk_file_ids,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
Example #7
def run_split_fastq(job, context, fastq, fastq_i, sample_fastq_id):
    
    RealtimeLogger.info("Starting fastq split")
    start_time = timeit.default_timer()
    
    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # We need the sample fastq for alignment
    fastq_name = os.path.basename(fastq[fastq_i])
    fastq_path = os.path.join(work_dir, fastq_name)
    fastq_gzipped = os.path.splitext(fastq_name)[1] == '.gz'
    fastq_name = os.path.splitext(fastq_name)[0]
    if fastq_gzipped:
        fastq_name = os.path.splitext(fastq_name)[0]
    job.fileStore.readGlobalFile(sample_fastq_id, fastq_path)

    # Split up the fastq into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    # 4 lines per read
    chunk_lines = chunk_size * 4

    # Note we do this on the command line because Python is too slow
    if fastq_gzipped:
        cmd = [['gzip', '-d', '-c', os.path.basename(fastq_path)]]
    else:
        cmd = [['cat', os.path.basename(fastq_path)]]

    cmd.append(['split', '-l', str(chunk_lines),
                '--filter=pigz -p {} > $FILE.fq.gz'.format(max(1, int(context.config.fq_split_cores) - 1)),
                '-', '{}-chunk.'.format(fastq_name)])

    context.runner.call(job, cmd, work_dir = work_dir, tool_name='pigz')

    fastq_chunk_ids = []
    for chunk_name in sorted(os.listdir(work_dir)):
        if chunk_name.endswith('.fq.gz') and chunk_name.startswith('{}-chunk'.format(fastq_name)):
            fastq_chunk_ids.append(context.write_intermediate_file(job, os.path.join(work_dir, chunk_name)))
        
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info("Split fastq into {} chunks. Process took {} seconds.".format(len(fastq_chunk_ids), run_time))

    return fastq_chunk_ids
Example #8
    def handle_reject(self, job, err):
        """
        Handle promise rejection.
        """

        Logger.error("Promise Rejected: {}".format(err))
        RealtimeLogger.error("Promise Rejected: {}".format(err))

        self.err = err

        # TODO: implement
        # Check if we have any reject handlers
        # If so call them
        # If not throw an error that stops the workflow
        raise err
Example #9
    def download(filename):
        """
        Download each file
        """

        try:

            if (not options.overwrite) and out_store.exists(filename):
                # File exists. But make sure its size is correct.

                if not options.check_size:
                    # Skip existing file. No need to check the length.
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

                out_size = out_store.get_size(filename)
                in_size = in_store.get_size(filename)
                if out_size != in_size:
                    # Complain about size mismatch and copy
                    RealtimeLogger.warning(
                        "Redownloading {}! Size was {} and not {}!".format(
                            filename, out_size, in_size))
                else:
                    # Skip existing file
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

            # Make a temp file
            (handle,
             path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
            os.close(handle)

            RealtimeLogger.debug("Download {}".format(filename))

            # Download
            in_store.read_input_file(filename, path)
            # Store
            out_store.write_output_file(path, filename)

            # Clean up
            os.unlink(path)

        except:
            # Put all exception text into an exception and raise that
            raise Exception("".join(
                traceback.format_exception(*sys.exc_info())))

        RealtimeLogger.info("Copied {}".format(filename))
Example #10
def run_concat_files(job, context, file_ids, dest_name=None, header=None):
    """
    Utility job to concatenate some files. Returns the concatenated file ID.
    If given a dest_name, writes the result to the out store with the given name.
    (We wanted to use name, but that kwarg is reserved by Toil.)
    If given a header, prepends the header to the file with a trailing newline.
    """

    requeue_promise = ensure_disk(job,
                                  run_concat_files, [context, file_ids], {
                                      "dest_name": dest_name,
                                      "header": header
                                  },
                                  file_ids,
                                  factor=2)
    if requeue_promise is not None:
        # We requeued ourselves with more disk to accommodate our inputs
        return requeue_promise

    work_dir = job.fileStore.getLocalTempDir()

    out_name = os.path.join(work_dir,
                            'output.dat' if dest_name is None else dest_name)

    # Concatenate all the files
    # TODO: We don't use the trick where we append to the first file to save a copy. Should we?
    with open(out_name, 'w') as out_file:
        if header is not None:
            # Put the header if specified
            out_file.write(header + '\n')
        for file_id in file_ids:
            with job.fileStore.readGlobalFileStream(file_id) as in_file:
                # Then beam over each file
                shutil.copyfileobj(in_file, out_file)

    if dest_name is None:
        # Send back an intermediate file
        RealtimeLogger.info(
            "Concatenated {} files into intermediate file {}".format(
                len(file_ids), out_name))
        return context.write_intermediate_file(job, out_name)
    else:
        # Write to outstore under the given name.
        RealtimeLogger.info(
            "Concatenated {} files into output file {} -> {}".format(
                len(file_ids), out_name, dest_name))
        return context.write_output_file(job, out_name, dest_name)
Example #11
    def calculate_features_for_atom(self, atom, only_aa=False, only_atom=False,
      non_geom_features=False, use_deepsite_features=False, warn_if_buried=False):
        if use_deepsite_features:
            features = self.get_deepsite_features(atom)
            if warn_if_buried:
                is_buried = self.get_accessible_surface_area(atom)[-2]
        elif only_atom:
            features = self.get_element_type(atom)
            if warn_if_buried:
                is_buried = self.get_accessible_surface_area(atom)[-2]
        elif only_aa:
            features = self.get_residue(atom)
            if warn_if_buried:
                is_buried = self.get_accessible_surface_area(atom)[-2]
        elif non_geom_features:
            features = np.zeros(13)
            features[0:5] = self.get_element_type(atom)
            features[5:9] = self.get_charge_and_electrostatics(atom)
            features[9:13] = self.get_hydrophobicity(atom)
            is_buried = self.get_accessible_surface_area(atom)[-2]
        else:
            features = np.empty(self.n_atom_features)

            features[0:13]  = self.get_atom_type(atom)
            features[13:18] = self.get_element_type(atom)
            features[18:19] = self.get_vdw(atom)
            features[19:26] = self.get_charge_and_electrostatics(atom)
            features[26:30] = self.get_concavity(atom)
            features[30:34] = self.get_hydrophobicity(atom)
            features[34:40] = self.get_accessible_surface_area(atom)
            features[40:61] = self.get_residue(atom)
            features[61:64] = self.get_ss(atom)
            features[64:70] = self.get_deepsite_features(atom, calc_charge=False,
                calc_conservation=False)
            features[70:73] = self.get_evolutionary_conservation_score(atom)

            is_buried = bool(features[35])

        RealtimeLogger.info("Finished atom {}".format(atom))

        self.atom_features[atom.serial_number-1] = features

        if warn_if_buried:
            return features, is_buried
        else:
            return features
Example #12
def ensure_disk(job, job_fn, job_fn_args, job_fn_kwargs, file_id_list, factor=8, padding=1024 ** 3):
    """
    Ensure that the currently running job has enough disk to load all the given
    file IDs (passed as any number of lists of file IDs), and process them,
    producing factor times as much data, plus padding.
    
    Takes the job, the function that is the job, the list of arguments passed
    in (except the job object), the dict of keyword args passed in, and then
    a file ID list or iterable.
    
    If there is not enough disk, re-queues the job with more disk, and returns
    the promise for the result.
    
    If there is enough disk, returns None
    
    TODO: Convert to promised requirements if it is sufficiently expressive.
    """
    
    # We need to compute the total size of our inputs, expected intermediates,
    # and outputs, and re-queue ourselves if we don't have enough disk.
    required_disk = 0
    
    for file_id in file_id_list:
        # For each file in the collection
        # Say we need space for it
        required_disk += file_id.size
    
    
    # Multiply out for intermediates and results
    # TODO: Allow different factors for different file IDs
    # We only need to multiply e.g. BAM files, not indexes
    required_disk *= factor
   
    # Add some padding
    required_disk += padding
        
    if job.disk < required_disk:
        # Re-queue with more disk
        RealtimeLogger.info("Re-queueing job with {} bytes of disk; originally had {}".format(required_disk, job.disk))
        requeued = job.addChildJobFn(job_fn, *job_fn_args, cores=job.cores, memory=job.memory, disk=required_disk, **job_fn_kwargs)
        return requeued.rv()
    else:
        # Disk we have is OK
        return None
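A minimal usage sketch of the requeue pattern (my_job and its arguments are hypothetical; Example #10 shows the same pattern in real code):

def my_job(job, context, file_ids):
    # Bail out early if this job was scheduled with too little disk.
    requeue_promise = ensure_disk(job, my_job, [context, file_ids], {},
                                  file_ids, factor=4)
    if requeue_promise is not None:
        # A copy of this job was re-queued with enough disk; hand back its promise.
        return requeue_promise
    # ... proceed, knowing job.disk covers inputs, intermediates, and outputs ...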
Example #13
def run_split_gam_reads(job, context, gam_input_reads, gam_reads_file_id):
    """ split up an input reads file in GAM format
    """
    RealtimeLogger.info("Starting gam split")
    start_time = timeit.default_timer()

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # We need the sample GAM file
    gam_path = os.path.join(work_dir, os.path.basename(gam_input_reads))
    job.fileStore.readGlobalFile(gam_reads_file_id, gam_path)

    # Split up the gam into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    cmd = [
        'vg', 'chunk', '-a',
        os.path.basename(gam_path), '--gam-split-size',
        str(chunk_size), '--prefix', 'gam_reads_chunk'
    ]

    context.runner.call(job, cmd, work_dir=work_dir)

    gam_chunk_ids = []
    for chunk_name in os.listdir(work_dir):
        if chunk_name.endswith('.gam') and chunk_name.startswith(
                'gam_reads_chunk'):
            gam_chunk_ids.append(
                context.write_intermediate_file(
                    job, os.path.join(work_dir, chunk_name)))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info(
        "Split gam into {} chunks. Process took {} seconds.".format(
            len(gam_chunk_ids), run_time))

    return gam_chunk_ids
Example #14
def create_data_loader(job, sfam_id, preemptable=True):
    """Create H5 for Molmimic3dCNN to read

    Note: move this somewhere else
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    id_format = re.compile(
        "^([A-Z0-9]{4})_([A-Za-z0-9]+)_sdi([0-9]+)_d([0-9]+)$")

    #Get all with keys same sfam, but do not download

    in_store = IOStore.get("{}:molmimic-clustered-structures".format(prefix))
    keys = [id_format.match(f).groups() for f in in_store.list_input_directory(sfam_id) \
        if f.endswith(".pdb") and id_format.match(f)]

    pdb_path = os.path.join(PDB_PATH, dataset_name, "by_superfamily",
                            str(int(sfam_id)))
    clusters_file = os.path.join(pdb_path, "{}_nr.fasta".format(int(sfam_id)))

    try:
        pdb, chain, sdi, domain = zip(*[id_format.match(seq.id[:-2]).groups() \
            for seq in SeqIO.parse(clusters_file, "fasta")])
    except ValueError:
        RealtimeLogger.info(
            "Unable to create data loading file for {}.".format(sfam_id))
        return

    domains = pd.DataFrame({
        "pdb": pdb,
        "chain": chain,
        "domNo": domain,
        "sdi": sdi
    })

    data_loader = os.path.join(pdb_path, "{}.h5".format(int(sfam_id)))
    domains.to_hdf(str(data_loader), "table", complevel=9, complib="bzip2")
Example #15
    def write_output_file(self, local_path, output_path):
        """
        Write output to Azure. Will create the container if necessary.
        """

        self.__connect()

        RealtimeLogger.debug("Saving {} to AzureIOStore".format(output_path))

        try:
            # Make the container
            self.connection.create_container(self.container_name)
        except azure.WindowsAzureConflictError:
            # The container probably already exists
            pass

        # Upload the blob (synchronously)
        # TODO: catch no container error here, make the container, and retry
        self.connection.put_block_blob_from_path(
            self.container_name, self.name_prefix + output_path, local_path)
Example #16
def run_split_reads_if_needed(job, context, fastq, gam_input_reads, bam_input_reads, reads_file_ids):
    """
    Return a list of lists of read chunk file IDs, one list per reads file.
    
    If the workflow is in single_reads_chunk mode (according to
    context.config.single_reads_chunk), produce one chunk per file.
    
    Otherwise, produce several chunks per file.
    """
    
    if not context.config.single_reads_chunk:
        reads_chunk_ids = job.addChildJobFn(run_split_reads, context, fastq, gam_input_reads, bam_input_reads,
                                            reads_file_ids,
                                            cores=context.config.misc_cores, memory=context.config.misc_mem,
                                            disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info("Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in reads_file_ids]
        
    return reads_chunk_ids
Example #17
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved, xg_file_id, paths):
    """ split the fastq, then surject each chunk.  returns outputgams, paired with total surject time
    (excluding toil-vg overhead such as transferring and splitting files )"""

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    if not context.config.single_reads_chunk:
        reads_chunk_ids = child_job.addChildJobFn(run_split_reads, context, None, 'aln.gam', None,
                                                  [gam_input_reads_id],
                                                  cores=context.config.misc_cores, memory=context.config.misc_mem,
                                                  disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info("Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in [gam_input_reads_id]]

    return child_job.addFollowOnJobFn(run_whole_surject, context, reads_chunk_ids, output_name, 
                                      interleaved, xg_file_id, paths, cores=context.config.misc_cores,
                                      memory=context.config.misc_mem, disk=context.config.misc_disk).rv()
Example #18
def get_original_complexes(job,
                           mol_sfam,
                           int_sfam,
                           group,
                           int_type,
                           work_dir=None):
    if work_dir is None:
        work_dir = os.getcwd()

    complex_files = []
    for _, row in group:
        row = row.iloc[0]
        RealtimeLogger.info("ROW: {}".format(row))

        mol_file, mol_resi, int_file, int_resi = process_interface(
            job, row, int_type, work_dir=work_dir)
        # try:
        #     mol_file = download_pdb(job, row.mol_superfam_id, row.mol_pdb,
        #         row.mol_chain, row.mol_sdi_id, row.mol_domNo, work_dir=work_dir)
        #
        #     int_file = download_pdb(job, row.int_superfam_id, row.int_pdb,
        #         row.int_chain, row.int_sdi_id, row.int_domNo, work_dir=work_dir)
        # except (KeyboardInterrupt, SystemExit):
        #     raise
        # except Exception as e:
        if mol_file is None or int_file is None:
            #PDB files not found, skip
            RealtimeLogger.info(
                "Cannot download PDB {}.{}.{} bc it was not found".format(
                    row.mol_pdb, row.mol_chain, row.mol_sdi_id))
            complex_files.append(None)
            continue

        merged_file = next(
            prep((mol_file, "M"), (int_file, "I"),
                 merge=True,
                 work_dir=work_dir))

        complex_files.append(merged_file)

    return complex_files
Example #19
def run_split_bam_reads(job, context, bam_input_reads, bam_reads_file_id):
    """ split up an input reads file in BAM format
    """
    RealtimeLogger.info("Starting bam split")
    start_time = timeit.default_timer()
    
    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # We need the sample BAM file
    bam_path = os.path.join(work_dir, os.path.basename(bam_input_reads))
    job.fileStore.readGlobalFile(bam_reads_file_id, bam_path)

    # Split up the bam into chunks

    # Make sure chunk size even in case paired interleaved
    chunk_size = context.config.reads_per_chunk
    if chunk_size % 2 != 0:
        chunk_size += 1

    # 1 line per read
    chunk_lines = chunk_size * 1

    cmd = [['samtools', 'view', os.path.basename(bam_path)]]
    cmd.append(['split', '-l', str(chunk_lines),
                '--filter=bash -c \'cat <(samtools view -H {}) <(cat -)'.format(os.path.basename(bam_path)) +
                ' | samtools view -O BAM --threads {} -'.format(max(1, int(context.config.fq_split_cores) - 1)) +
                ' > $FILE.bam\'', '-', 'bam_reads_chunk.'])

    context.runner.call(job, cmd, work_dir = work_dir)

    bam_chunk_ids = []
    for chunk_name in sorted(os.listdir(work_dir)):
        if chunk_name.endswith('.bam') and chunk_name.startswith('bam_reads_chunk'):
            bam_chunk_ids.append(context.write_intermediate_file(job, os.path.join(work_dir, chunk_name)))
        
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info("Split bam into {} chunks. Process took {} seconds.".format(len(bam_chunk_ids), run_time))

    return bam_chunk_ids
Example #20
def up(job, inputFileID1, inputFileID2, options, memory=sortMemory):
    """
    Merges the two files and places them in the output.
    """
    with job.fileStore.writeGlobalFileStream() as (fileHandle,
                                                   outputFileStoreID):
        fileHandle = codecs.getwriter('utf-8')(fileHandle)
        with job.fileStore.readGlobalFileStream(
                inputFileID1) as inputFileHandle1:
            inputFileHandle1 = codecs.getreader('utf-8')(inputFileHandle1)
            with job.fileStore.readGlobalFileStream(
                    inputFileID2) as inputFileHandle2:
                inputFileHandle2 = codecs.getreader('utf-8')(inputFileHandle2)
                RealtimeLogger.info(
                    "Merging %s and %s to %s" %
                    (inputFileID1, inputFileID2, outputFileStoreID))
                merge(inputFileHandle1, inputFileHandle2, fileHandle)
        # Clean up the input files - these deletes will only take effect once the job completes successfully.
        job.fileStore.deleteGlobalFile(inputFileID1)
        job.fileStore.deleteGlobalFile(inputFileID2)
        return outputFileStoreID
Example #21
def copy_everything(job, options):
    """
    Download the file list and copy all the files.
    
    """

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    batch_count = 0

    # List all the files.
    blobs_iterator = in_store.list_input_directory("", recursive=True)

    # Make an iterator that filters them
    filtered_iterator = (x for x in blobs_iterator
                         if fnmatch.fnmatchcase(x, options.pattern))

    # Batch them up
    for batch in group(filtered_iterator, options.batch_size):

        # For every batch, strip out any Nones that got put in when grouping
        batch = [x for x in batch if x is not None]

        # Copy everything in that batch
        job.addChildJobFn(copy_batch,
                          options,
                          batch,
                          cores=1,
                          memory="1G",
                          disk="10G")

        batch_count += 1

        if batch_count % 10 == 0:

            RealtimeLogger.info("Queued {} batches...".format(batch_count))

    RealtimeLogger.info("Queued {} total batches".format(batch_count))
Example #22
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    #Read in H5 for entire table
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfams = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
        columns=["nbr_superfam_id"]).drop_duplicates().dropna()
    skip_sfam = set([s for s in sfams["nbr_superfam_id"] if \
        out_store.exists("{0}/{0}.inferred_interactome".format(int(s))) or \
        not out_store.exists("{0}/{0}.observed_interactome".format(int(s)))])

    # skip_sfam = set([int(f.split("/", 1)[0]) for f in out_store.list_input_directory() \
    #    if f.endswith(".inferred_interactome")])

    sfams = sfams[~sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = sfams["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    # partial_sfams = set(int(k.split("/")[0]) for sfam in sfams for k in \
    #     out_store.list_input_directory(
    #         "{sfam}/_inftables/Intrac{table}_{sfam}.inferred_interactome".format( \
    #         sfam=sfam, table=table)) if not k.endswith("failed"))

    #sfams = list(set(sfams)-partial_sfams)

    if len(sfams) > 0:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    try:
        os.remove(tableInfPath)
    except OSError:
        pass
Example #23
def main_job(job, options, sam_urls):
    """
    Root job of the Toil workflow. Download the sample URLs.
    
    Returns a Directory containing a bunch of output files.
    
    """

    RealtimeLogger.info("Main job starting")
    RealtimeLogger.info("Temp directory location: {}".format(
        job.fileStore.getLocalTempDir()))

    # Make sure we can use samtools
    options.drunner.call(job, [["samtools", "--version"]])

    # We'll fill this with promises for subdirectories by sample filename
    subdir_promises = {}

    for sam_url in sam_urls:
        # Work out the base filename
        sam_filename = os.path.basename(urlparse.urlparse(sam_url).path)

        # Go download and convert the reads, and stick the FASTQs in a directory
        subdir_promises[sam_filename] = ToilPromise.wrap(
            job.addChildJobFn(extract_job,
                              options,
                              sam_url,
                              cores=1,
                              memory="16G",
                              disk="500G")).then(
                                  lambda (fq1, fq2): Directory({
                                      "fq1.fastq": fq1,
                                      "fq2.fastq": fq2
                                  }))

    # Mount each subdirectory under its original sam/bam/cram filename
    return ToilPromise.all(subdir_promises).then(
        lambda dirs: Directory().mount_all(dirs)).unwrap_result()
Example #24
def poll(job, options, file_id, number, cores=0.1, disk='200M', memory='512M'):

    # Wait a random amount of time before grabbing the file for others to cache it
    time.sleep(random.randint(options.minSleep, options.minSleep + 10))

    # Read the file. Don't accept a symlink because then we might just have the
    # filestore's copy, even if caching is not happening.
    local_file = job.fileStore.readGlobalFile(file_id, cache=True, mutable=False, symlink=False)
    
    # Wait a random amount of time after grabbing the file for others to use it
    time.sleep(random.randint(options.minSleep, options.minSleep + 10))
    
    # Stat the file (reads through links)
    stats = os.stat(local_file)
    
    # Check what machine we are
    hostname = socket.gethostname()
    
    RealtimeLogger.info('Job {} on host {} sees file at device {} inode {}'.format(number, hostname, stats.st_dev, stats.st_ino))
    
    # Return a tuple representing our view of the file.
    # Drop hostname since hostnames are unique per pod.
    return (stats.st_dev, stats.st_ino)
Example #25
    def new_function(*args, **kwargs):
        # Call backoff times, overriding parameters with stuff from kwargs
        for delay in backoff_times(retries=kwargs.get("retries", retries),
                                   base_delay=kwargs.get(
                                       "base_delay", base_delay)):
            # Keep looping until it works or our iterator raises a
            # BackoffError
            if delay > 0:
                # We have to wait before trying again
                RealtimeLogger.error("Retry after {} seconds".format(delay))
                time.sleep(delay)
            try:
                return original_function(*args, **kwargs)
            except:
                # Report the formatted underlying exception with traceback
                RealtimeLogger.error("{} failed due to: {}".format(
                    original_function.__name__,
                    "".join(traceback.format_exception(*sys.exc_info()))))

        # If we get here, the function we're calling never ran through before we
        # ran out of backoff times. Give an error.
        raise BackoffError("Ran out of retries calling {}".format(
            original_function.__name__))
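For orientation, a sketch (not from the source) of the decorator shape that would produce a wrapper like new_function above; backoff_times and BackoffError come from the snippet and are assumed to be defined elsewhere, and the decorator name is made up:

import functools

def retry_with_backoff(retries=5, base_delay=2):
    def decorate(original_function):
        @functools.wraps(original_function)
        def new_function(*args, **kwargs):
            ...  # retry loop as shown above
        return new_function
    return decorate

# Hypothetical usage: any raised exception triggers a logged retry with backoff.
@retry_with_backoff(retries=3)
def flaky_upload(connection, container, name, path):
    connection.put_block_blob_from_path(container, name, path)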
Example #26
    def read_input_file(self, input_path, local_path):
        """
        Get input from the filesystem.
        """

        RealtimeLogger.debug("Loading {} from FileIOStore in {} to {}".format(
            input_path, self.path_prefix, local_path))

        if os.path.exists(local_path):
            # Try deleting the existing item if it already exists
            try:
                os.unlink(local_path)
            except:
                # Don't fail here, fail complaining about the assertion, which
                # will be more informative.
                pass

        # Make sure the path is clear for copying
        assert (not os.path.exists(local_path))

        # Where is the file actually?
        real_path = os.path.abspath(os.path.join(self.path_prefix, input_path))

        if not os.path.exists(real_path):
            RealtimeLogger.error(
                "Can't find {} from FileIOStore in {}!".format(
                    input_path, self.path_prefix))
            raise RuntimeError("File {} missing!".format(real_path))

        # Make a temporary file
        temp_handle, temp_path = tempfile.mkstemp(
            dir=os.path.dirname(local_path))
        os.close(temp_handle)

        # Copy to the temp file
        shutil.copy2(real_path, temp_path)

        # Rename the temp file to the right place, atomically
        RealtimeLogger.info("rename {} -> {}".format(temp_path, local_path))
        os.rename(temp_path, local_path)

        # Look at the file stats
        file_stats = os.stat(real_path)

        if (file_stats.st_uid == os.getuid()
                and file_stats.st_mode & stat.S_IWUSR):
            # We own this file and can write to it. We don't want the user
            # script messing it up through the symlink.

            try:
                # Clear the user write bit, so the user can't accidentally
                # clobber the file in the actual store through the symlink.
                os.chmod(real_path, file_stats.st_mode ^ stat.S_IWUSR)
            except OSError:
                # If something goes wrong here (like us not having permission to
                # change permissions), ignore it.
                pass
Example #27
def extract_job(job, options, sam_url):
    """
    Extract and fix up the given SAM/BAM/CRAM reads by URL.
    
    Return a pair of FASTQ file IDs.
    """

    # We have to deal with relative paths relative to here if we want Docker to
    # work right
    work_dir = job.fileStore.getLocalTempDir()

    # Let's just download the whole bam
    sorted_bam = "sorted.bam"

    # We need a prefix for temp files
    temp_prefix = sorted_bam + ".part"

    RealtimeLogger.info("Sort {} to BAM".format(sam_url))

    # Sort reads by name to a BAM file. If we don't give a temp file prefix it
    # tries to write the temp files back to the FTP.
    options.drunner.call(job, [[
        "samtools", "sort", "-n", "-o", sorted_bam, "-T", temp_prefix, sam_url
    ]],
                         work_dir=work_dir)

    # Save to file store
    bam_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, sorted_bam))

    # Convert and return FASTQs
    return job.addChildJobFn(convert_job,
                             options,
                             sam_url,
                             bam_id,
                             cores=4,
                             memory="16G",
                             disk="500G").rv()
Example #28
def down(job, inputFileStoreID, N, path, downCheckpoints, options, memory=sortMemory):
    """
    Input is a file, a subdivision size N, and a path in the hierarchy of jobs.
    If the range is larger than a threshold N the range is divided recursively and
    a follow on job is then created which merges back the results else
    the file is sorted and placed in the output.
    """
    
    RealtimeLogger.info("Down job starting: %s" % path)
    
    # Read the file
    inputFile = job.fileStore.readGlobalFile(inputFileStoreID, cache=False)
    length = os.path.getsize(inputFile)
    if length > N:
        # We will subdivide the file
        RealtimeLogger.critical("Splitting file: %s of size: %s"
                % (inputFileStoreID, length))
        # Split the file into two copies
        midPoint = getMidPoint(inputFile, 0, length)
        t1 = job.fileStore.getLocalTempFile()
        with open(t1, 'w') as fH:
            fH.write(copySubRangeOfFile(inputFile, 0, midPoint+1))
        t2 = job.fileStore.getLocalTempFile()
        with open(t2, 'w') as fH:
            fH.write(copySubRangeOfFile(inputFile, midPoint+1, length))
        # Call down recursively. By giving the rv() of the two jobs as inputs to the follow-on job, up,
        # we communicate the dependency without hindering concurrency.
        result = job.addFollowOnJobFn(up,
                                    job.addChildJobFn(down, job.fileStore.writeGlobalFile(t1), N, path + '/0',
                                                      downCheckpoints, checkpoint=downCheckpoints, options=options,
                                                      preemptable=True, memory=options.sortMemory).rv(),
                                    job.addChildJobFn(down, job.fileStore.writeGlobalFile(t2), N, path + '/1',
                                                      downCheckpoints, checkpoint=downCheckpoints, options=options,
                                                      preemptable=True, memory=options.mergeMemory).rv(),
                                    path + '/up', preemptable=True, options=options, memory=options.sortMemory).rv()
    else:
        # We can sort this bit of the file
        RealtimeLogger.critical("Sorting file: %s of size: %s"
                % (inputFileStoreID, length))
        # Sort the copy and write back to the fileStore
        shutil.copyfile(inputFile, inputFile + '.sort')
        sort(inputFile + '.sort')
        result = job.fileStore.writeGlobalFile(inputFile + '.sort')
        
    RealtimeLogger.info("Down job finished: %s" % path)
    return result
Example #29
    def _runMainLoop(self, rootJob):
        """
        Runs the main loop with the given job.
        :param toil.job.Job rootJob: The root job for the workflow.
        :rtype: Any
        """
        with RealtimeLogger(self._batchSystem,
                            level=self.options.logLevel if self.options.realTimeLogging else None):
            # FIXME: common should not import from leader
            from toil.leader import mainLoop
            return mainLoop(config=self.config,
                            batchSystem=self._batchSystem,
                            provisioner=None,
                            jobStore=self._jobStore,
                            rootJobWrapper=rootJob,
                            jobCache=self._jobCache)
Example #30
def run_id_increment(job, options, graph_i, graph_id, distance):
    """
    Actually do the ID incrementing. Is a separate, toil-vg-style job so it
    can be added to toil-vg and so we can set the correct resource requirements.
    
    """

    RealtimeLogger.info("Starting graph shift...")
    start_time = timeit.default_timer()

    work_dir = job.fileStore.getLocalTempDir()

    # download graph
    graph_filename = os.path.join(work_dir,
                                  '{}.vg'.format(options.chroms[graph_i]))
    toil_vg.vg_common.read_from_store(job, options, graph_id, graph_filename)

    # Output
    output_graph_filename = graph_filename + '.shifted.vg'

    RealtimeLogger.info("Moving {} up by {} to {}".format(
        graph_filename, distance, output_graph_filename))

    with open(output_graph_filename, "w") as out_file:
        command = [
            'vg', 'ids', '--increment',
            str(distance),
            os.path.basename(graph_filename)
        ]
        options.drunner.call(job, command, work_dir=work_dir, outfile=out_file)

    # Back to store
    output_graph_id = toil_vg.vg_common.write_to_store(job, options,
                                                       output_graph_filename)

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info(
        "Finished graph shift. Process took {} seconds.".format(run_time))

    return output_graph_id
Example #31
    def run(self, fileStore):
        RealtimeLogger.info('This should be logged at info level')
        RealtimeLogger.debug('This should be logged at debug level')
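Whether both messages above reach the leader depends on the workflow options: as Example #29 suggests, the real-time log level follows the workflow log level and is only active when real-time logging is enabled. A minimal sketch of options that would surface the debug call as well (the job store path is a placeholder):

from toil.job import Job

options = Job.Runner.getDefaultOptions("./jobstore")
options.realTimeLogging = True  # enable forwarding of RealtimeLogger messages
options.logLevel = "DEBUG"      # required for the .debug() message to appear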