Example #1
 def test_bam_quickcheck(self):
     from toil_lib.validators import bam_quickcheck
     good_bam_url = 's3://cgl-pipeline-inputs/exome/ci/chr6.normal.bam'
     bad_bam_url = 's3://cgl-pipeline-inputs/exome/ci/truncated.bam'
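     # Run each download in its own Toil workflow, then check that quickcheck
     # passes for the intact BAM and fails for the truncated one.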
     good_bam_job = Job.wrapJobFn(download_url, url=good_bam_url, name='good.bam', work_dir=self.tmpdir)
     with Toil(self.options) as toil:
         toil.start(good_bam_job)
     bad_bam_job = Job.wrapJobFn(download_url, url=bad_bam_url, name='bad.bam', work_dir=self.tmpdir)
     with Toil(self.options) as toil:
         toil.start(bad_bam_job)
     assert bam_quickcheck(os.path.join(self.tmpdir, 'good.bam'))
     assert not bam_quickcheck(os.path.join(self.tmpdir, 'bad.bam'))
Example #2
def runCactusBlast(sequenceFiles,
                   alignmentsFile,
                   toilDir,
                   chunkSize=None,
                   overlapSize=None,
                   logLevel=None,
                   compressFiles=None,
                   lastzMemory=None,
                   targetSequenceFiles=None):

    options = Job.Runner.getDefaultOptions(toilDir)
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize,
                                overlapSize=overlapSize,
                                compressFiles=compressFiles,
                                memory=lastzMemory)
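    # Import the input sequences into the job store, run the blast root job,
    # and export the resulting alignments file.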
    with Toil(options) as toil:
        seqIDs = [
            toil.importFile(makeURL(seqFile)) for seqFile in sequenceFiles
        ]

        if targetSequenceFiles:
            targetSeqIDs = [
                toil.importFile(makeURL(seqFile))
                for seqFile in targetSequenceFiles
            ]
            rootJob = BlastSequencesAgainstEachOther(
                sequenceFileIDs1=seqIDs,
                sequenceFileIDs2=targetSeqIDs,
                blastOptions=blastOptions)
        else:
            rootJob = BlastSequencesAllAgainstAll(seqIDs, blastOptions)
        alignmentsID = toil.start(rootJob)
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))
Example #3
    def _importExportFile(self, options, fail):
        with Toil(options) as toil:
            if not options.restart:

                srcFile = '%s/%s%s' % (self._tempDir, 'in', uuid.uuid4())
                with open(srcFile, 'w') as f:
                    f.write('Hello')
                inputFileID = toil.importFile('file://' + srcFile)

                # Write a boolean that determines whether the job fails.
                with toil._jobStore.writeFileStream() as (f, failFileID):
                    self.failFileID = failFileID
                    f.write(str(fail))

                outputFileID = toil.start(
                    HelloWorld(inputFileID, self.failFileID))
            else:
                # Set up job for failure
                with toil._jobStore.updateFileStream(self.failFileID) as f:
                    f.write('False')

                outputFileID = toil.restart()

            toil.exportFile(outputFileID, 'file://' + self.dstFile)
            with open(self.dstFile, 'r') as f:
                assert f.read() == "HelloWorld!"
Example #4
def start_toil(dataset_name, options, use_data=False):
    if use_data:
        data = Job.wrapJobFn(download_data.start_toil).encapsulate()
        mmdb2pdb = data.addFollowOnJobFn(
            convert_mmdb_to_pdb.start_toil).encapsulate()
    else:
        mmdb2pdb = Job.wrapJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()

    interactome = mmdb2pdb.addChildJobFn(get_structural_interactome.start_toil,
                                         dataset_name).encapsulate()
    bsa = interactome.addFollowOnJobFn(calculate_bsa.start_toil,
                                       dataset_name).encapsulate()

    prep_protein = mmdb2pdb.addChildJobFn(prepare_protein.start_toil,
                                          dataset_name).encapsulate()
    features = mmdb2pdb.addFollowOnJobFn(calculate_features.start_toil,
                                         dataset_name,
                                         name="features").encapsulate()

    filter = mmdb2pdb.addFollowOnJobFn(filter_dataset.start_toil,
                                       dataset_name,
                                       name="filter").encapsulate()

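    # Start from the data-download job when use_data is set; otherwise start
    # directly from the MMDB-to-PDB conversion job.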
    with Toil(options) as toil:
        toil.start(mmdb2pdb if not use_data else data)
Example #5
    def testImportExportFilePermissions(self):
        """
        Ensures that uploaded files preserve their file permissions when they
        are downloaded again. This function checks that an imported executable file
        maintains its executability after being exported.
        """
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        with Toil(options) as toil:
            for executable in True, False:
                srcFile = '%s/%s%s' % (self._tempDir, 'in', str(uuid.uuid4()))
                with open(srcFile, 'w') as f:
                    f.write('Hello')

                if executable:
                    # Add file owner execute permissions
                    os.chmod(srcFile, os.stat(srcFile).st_mode | stat.S_IXUSR)

                # Current file owner execute permissions
                initialPermissions = os.stat(srcFile).st_mode & stat.S_IXUSR
                fileID = toil.importFile('file://' + srcFile)
                toil.exportFile(fileID, 'file://' + self.dstFile)
                currentPermissions = os.stat(
                    self.dstFile).st_mode & stat.S_IXUSR

                assert initialPermissions == currentPermissions
Example #6
def main(options=None):
    if not options:
        parser = ArgumentParser()
        Job.Runner.addToilOptions(parser)
        parser.add_argument('--pathGPX',
                            help='The absolute path where all GPX files are.')
        parser.add_argument('--pathNPY',
                            help='An npy file (a stored np.array) with the path of each file to be analysed')
        #        parser.add_argument("--O",  help="Output destination path ",default=defaultOutPath)

        options = parser.parse_args()

    #some checks
    npyFile = options.pathNPY
    if not os.path.exists(npyFile):
        print("the npy file [fileNameGPX1,fileNameGPX2,...] does not exist.")
        exit()

    try:
        codes = np.load(npyFile)
    except Exception:
        raise RuntimeError("Invalid format of npy file: %s" % options.pathNPY)
    if len(codes) == 0:
        raise RuntimeError("Invalid values of npy file: %s" % options.pathNPY)

    #Run workflow
    with Toil(options) as workflow:
        if not workflow.options.restart:
            workflow.start(Job.wrapJobFn(generateNetwork, options=options))
        else:
            workflow.restart()
Example #7
def run_toil(options):
    """Toil implementation for cgpCaveman."""
    setup = StepRunner(process="setup", options=options)
    split = Split(options=options)
    remove = RemoveContigs(options=options)
    concat = StepRunner(process="split_concat", options=options)
    mstep = SplitRunner(process="mstep", options=options)
    merge = StepRunner(process="merge", options=options)
    estep = SplitRunner(process="estep", options=options)
    results = StepRunner(process="merge_results", options=options)
    add_ids = StepRunner(process="add_ids", options=options)
    flag = StepRunner(process="flag", options=options, runtime=None)

    # build dag
    setup.addFollowOn(split)
    split.addFollowOn(remove)
    remove.addFollowOn(concat)
    concat.addFollowOn(mstep)
    mstep.addFollowOn(merge)
    merge.addFollowOn(estep)
    estep.addFollowOn(results)
    results.addFollowOn(add_ids)
    add_ids.addFollowOn(flag)

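    # Launch the chain from the setup step, or resume an interrupted run.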
    with Toil(options) as pipe:
        if not pipe.options.restart:
            pipe.start(setup)
        else:
            pipe.restart()
Example #8
def main():
    # Establish session
    session = boto3.session.Session()
    s3 = session.resource('s3')

    # Grab objects from upload bucket to not download duplicates
    upload_bucket_name = 'jvivian-ccle-data'
    upload_bucket = s3.Bucket(upload_bucket_name)
    processed_keys = set([obj.key for obj in upload_bucket.objects.all()])

    # Collect all keys to be processed
    download_bucket_name = 'cgl-ccle-data'
    download_bucket = s3.Bucket(download_bucket_name)
    keys = [
        x.key for x in download_bucket.objects.all()
        if x.key not in processed_keys
    ]
    keys = [
        x for x in keys if not x.startswith('output') and x.endswith('.tar.gz')
    ]

    # Start Toil run
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    with Toil(options) as toil:
        if not toil.options.restart:
            toil.start(
                Job.wrapJobFn(map_job, workflow, keys, download_bucket_name,
                              upload_bucket_name))
        else:
            toil.restart()
Example #9
def runCactusBlastIngroupsAndOutgroups(ingroups, outgroups, alignmentsFile, toilDir,
                                       outgroupFragmentPaths=None,
                                       ingroupCoveragePaths=None,
                                       chunkSize=None,
                                       overlapSize=None,
                                       logLevel=None,
                                       compressFiles=None,
                                       lastzMemory=None):
    options = Job.Runner.getDefaultOptions(toilDir)
    options.disableCaching = True
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize, overlapSize=overlapSize,
                                compressFiles=compressFiles,
                                memory=lastzMemory)
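    # Import ingroup and outgroup sequences, run the blast workflow, then export
    # the alignments along with any outgroup fragment and ingroup coverage files.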
    with Toil(options) as toil:
        ingroupIDs = [toil.importFile(makeURL(ingroup)) for ingroup in ingroups]
        outgroupIDs = [toil.importFile(makeURL(outgroup)) for outgroup in outgroups]
        rootJob = BlastIngroupsAndOutgroups(blastOptions, ingroupIDs, outgroupIDs)
        blastResults = toil.start(rootJob)
        alignmentsID = blastResults[0]
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))
        outgroupFragmentIDs = blastResults[1]
        ingroupCoverageIDs = blastResults[2]

        if outgroupFragmentPaths:
            assert len(outgroupFragmentIDs) == len(outgroupFragmentPaths)
            for outgroupFragmentID, outgroupFragmentPath in zip(outgroupFragmentIDs, outgroupFragmentPaths):
                toil.exportFile(outgroupFragmentID, makeURL(outgroupFragmentPath))
        if ingroupCoveragePaths:
            assert len(ingroupCoverageIDs) == len(ingroupCoveragePaths)
            for ingroupCoverageID, ingroupCoveragePath in zip(ingroupCoverageIDs, ingroupCoveragePaths):
                toil.exportFile(ingroupCoverageID, makeURL(ingroupCoveragePath))
Example #10
def main():
    # if I wanted to make this into a true command line tool, I'd fill out the parser.
    # Instead, I'm just going to add the bare minimum for making a workflow. 
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    # parser.add_argument(
    #     'output_dir', help='The dir to save the output, target bedfiles.', type=str)
    options = parser.parse_args()

    assembly_dir = "./asms/"
    output_dir = "./liftovers/"
    assembly_files = {"HG03098_paf_chr21": assembly_dir + "HG03098_paf_chr21.fa",
                      "HG03492_paf_chr21": assembly_dir + "HG03492_paf_chr21.fa",
                      "hg38_chr21": assembly_dir + "hg38_chr21.fa"}
    hal_file = "ref_based_small_chr21.hal"

    with Toil(options) as workflow:
        if not workflow.options.restart:
            #importing files:
            for asm, asm_file in assembly_files.items():
                assembly_files[asm] = workflow.importFile("file://" + os.path.abspath(asm_file))
            
            hal_file = workflow.importFile("file://" + os.path.abspath(hal_file))
                
            #todo: update here, for running not in cactus_connectivity, need assembly_lengths, possibly other things.
            liftovers = workflow.start(Job.wrapJobFn(all_to_all_liftovers, assembly_files, hal_file, output_dir))
            # liftovers = workflow.start(Job.wrapJobFn(all_to_all_liftovers, assembly_files, hal_file, output_dir, cores=3))

            for target_asm, liftovers_dict in liftovers.items():
                for source_asm, liftover_file in liftovers_dict.items():
                    workflow.exportFile(liftover_file,
                                        'file://' + os.path.abspath(output_dir) + "/" +
                                        source_asm + "_source_" + target_asm + "_target_liftover.bed")

        else:
            output = workflow.restart()
Example #11
def start_toil(options, use_data=False):
    from molmimic.generate_data import download_data
    from molmimic.generate_data import convert_mmdb_to_pdb
    from molmimic.generate_data import get_structural_interactome
    from molmimic.generate_data import calculate_bsa
    from molmimic.generate_data import prepare_protein
    from molmimic.generate_data import calculate_features
    from molmimic.generate_data import filter_dataset

    if use_data:
        data = Job.wrapJobFn(download_data.start_toil).encapsulate()
        mmdb2pdb = data.addFollowOnJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()
    else:
        mmdb2pdb = Job.wrapJobFn(convert_mmdb_to_pdb.start_toil).encapsulate()

    interactome = mmdb2pdb.addChildJobFn(get_structural_interactome.start_toil).encapsulate()
    bsa = interactome.addFollowOnJobFn(calculate_bsa.start_toil).encapsulate()

    prep_protein = mmdb2pdb.addChildJobFn(prepare_protein.start_toil).encapsulate()
    features = mmdb2pdb.addFollowOnJobFn(calculate_features.start_toil).encapsulate()

    filter = mmdb2pdb.addFollowOnJobFn(filter_dataset.start_toil).encapsulate()

    with Toil(options) as toil:
        toil.start(mmdb2pdb if not use_data else data)
Example #12
def runCactusPreprocessor(outputSequenceDir, configFile, inputSequences,
                          toilDir):
    toilOptions = Job.Runner.getDefaultOptions(toilDir)
    toilOptions.logLevel = "INFO"
    toilOptions.disableCaching = True
    with Toil(toilOptions) as toil:
        stageWorkflow(outputSequenceDir, configFile, inputSequences, toil)
Example #13
def main():
    opts = parse_args()
    with Toil(opts) as toil:
        if opts.restart:
            outfile_ids, fasta_ids, basenames = toil.restart()
        else:
            input_ids = []
            input_types = []
            input_basenames = []
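            # Import each input sequence, recording its compression type and
            # basename so outputs can be exported under matching names.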
            for input_sequence in opts.input_sequences:
                input_sequence_id = toil.importFile(makeURL(input_sequence))
                if input_sequence.endswith(".gz") or input_sequence.endswith(
                        ".gzip"):
                    type = "gzip"
                else:
                    type = "fasta"
                input_ids.append(input_sequence_id)
                input_types.append(type)

                basename = os.path.basename(input_sequence)
                if basename in input_basenames:
                    raise RuntimeError("Inputs must have unique filenames.")
                input_basenames.append(basename)
            outfile_ids, fasta_ids, basenames = toil.start(
                Job.wrapJobFn(launch_parallel, input_ids, input_types,
                              input_basenames, opts))
        for outfile_id, fasta_id, basename in zip(outfile_ids, fasta_ids,
                                                  basenames):
            toil.exportFile(
                fasta_id,
                makeURL(os.path.join(opts.output_path, basename + '.masked')))
            toil.exportFile(
                outfile_id,
                makeURL(os.path.join(opts.output_path, basename + '.out')))
Example #14
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta, input_file_ids.annotation_gp,
                        input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
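            # Size the setup job's disk request from the total size of the imported inputs.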
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.items():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
Example #15
    def _importExportFile(self, options, fail):
        with Toil(options) as toil:
            if not options.restart:

                srcFile = '%s/%s%s' % (self._tempDir, 'in', uuid.uuid4())
                with open(srcFile, 'w') as f:
                    f.write('Hello')
                inputFileID = toil.importFile('file://' + srcFile)
                # Make sure that importFile returns the fileID wrapper
                self.assertIsInstance(inputFileID, FileID)
                self.assertEqual(os.stat(srcFile).st_size, inputFileID.size)

                # Write a boolean that determines whether the job fails.
                with toil._jobStore.writeFileStream() as (f, failFileID):
                    self.failFileID = failFileID
                    f.write(str(fail).encode('utf-8'))

                outputFileID = toil.start(
                    RestartingJob(inputFileID, self.failFileID))
            else:
                # Set up job for failure
                with toil._jobStore.updateFileStream(self.failFileID) as f:
                    f.write('False'.encode('utf-8'))

                outputFileID = toil.restart()

            toil.exportFile(outputFileID, 'file://' + self.dstFile)
            with open(self.dstFile, 'r') as f:
                assert f.read() == "HelloWorld!"
Example #16
    def _importExportFile(self, options, fail):
        with Toil(options) as toil:
            if not options.restart:

                srcFile = '%s/%s%s' % (self._tempDir, 'in', str(uuid.uuid4()))
                with open(srcFile, 'w') as f:
                    f.write('Hello')
                inputFileID = toil.importFile('file://' + srcFile)
                # Make sure that importFile returns the fileID wrapper
                self.assertIsInstance(inputFileID, FileID)
                self.assertEqual(os.stat(srcFile).st_size, inputFileID.size)

                # Write a boolean that determines whether the job fails.
                failFilePath = '%s/%s%s' % (self._tempDir, 'failfile', str(uuid.uuid4()))
                with open(failFilePath, 'wb') as f:
                    f.write(str(fail).encode('utf-8'))
                self.failFileID = toil.importFile('file://' + failFilePath)

                outputFileID = toil.start(RestartingJob(inputFileID, self.failFileID))
            else:
                # Set up job for failure
                # TODO: We're hackily updating this file without using the
                # correct FileStore interface. User code should not do this!
                with toil._jobStore.updateFileStream(self.failFileID) as f:
                    f.write('False'.encode('utf-8'))

                outputFileID = toil.restart()

            toil.exportFile(outputFileID, 'file://' + self.dstFile)
            with open(self.dstFile, 'r') as f:
                assert f.read() == "HelloWorld!"
Example #17
def main():

    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    opts = parse_args()
    opts.hubDir = os.path.abspath(opts.hubDir)
    opts.hals = [os.path.abspath(hal) for hal in opts.hals]
    if opts.batchSystem != 'singleMachine':
        raise RuntimeError("singleMachine is the only supported batchSystem")

    # Create labels for the HALs if none were provided
    if opts.labels is None:
        opts.labels = [os.path.basename(hal) for hal in opts.hals]
    if len(opts.labels) != len(opts.hals):
        raise ValueError(
            "%d labels were provided, but %d hals were provided." %
            (len(opts.labels), len(opts.hals)))

    # Ensure that the hals have some genomes in common, and take the
    # common genomes to display in the hub.
    genomess = [getGenomesInHal(hal) for hal in opts.hals]
    genomes = reduce(lambda a, i: a.intersection(i), genomess)
    if len(genomes) == 0:
        raise ValueError("No genomes in common between the HALs.")

    with Toil(opts) as toil:
        toil.start(Job.wrapJobFn(createHub, genomes, opts))
Example #18
def main():
    # if I wanted to make this into a true command line tool, I'd fill out the parser.
    # Instead, I'm just going to add the bare minimum for making a workflow.
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    # parser.add_argument(
    #     'output_dir', help='The dir to save the output, target bedfiles.', type=str)
    options = parser.parse_args()
    with Toil(options) as workflow:
        if not workflow.options.restart:
            # #importing files:
            # for asm, asm_file in assembly_files.items():
            #     assembly_files[asm] = workflow.importFile("file://" + os.path.abspath(asm_file))

            # hal_file = workflow.importFile("file://" + os.path.abspath(hal_file))

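            # NOTE: assembly_files, hal_file and output_dir are assumed to be
            # defined elsewhere; the import lines above are commented out.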
            liftovers = workflow.start(
                Job.wrapJobFn(calculate_bases_unmapped, assembly_files,
                              hal_file, output_dir))

            for target_asm, liftovers_dict in liftovers.items():
                for source_asm, liftover_file in liftovers_dict.items():
                    workflow.exportFile(
                        liftover_file, 'file://' +
                        os.path.abspath(output_dir) + "/" + source_asm +
                        "_source_" + target_asm + "_target_liftover.bed")

        else:
            output = workflow.restart()
Example #19
def main():
    parser = Job.Runner.getDefaultArgumentParser()
    parser.add_argument('packageFile')
    parser.add_argument('--outputDir', required=False,
            default=get_default_outputDir())
    parser.add_argument('--dockerDefer', required=False,
            default='RM')
    options = parser.parse_args()

    options.disableCaching = True

    with Toil(options) as toil:
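        # Look up the requested docker deferral policy (e.g. RM) by name and
        # persist it in the job store's config.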
        toil.config.dockerDefer = globals()[options.dockerDefer.upper()]
        toil._jobStore.writeConfig()

        if not toil.options.restart:
            storage = start(toil)
        else:
            storage = restart(toil)

        try:
            os.makedirs(options.outputDir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        storage.exportLocalFiles(toil, options.outputDir)
Example #20
def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(
                t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(
                t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(
                t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup,
                                args,
                                input_file_ids,
                                memory='16G',
                                disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)
Example #21
    def runToilPipeline(self, alignmentsFile, alpha=0.001):
        # Tests the toil pipeline
        options = Job.Runner.getDefaultOptions(
            os.path.join(self.tempDir, "toil"))
        options.logLevel = self.logLevelString

        with Toil(options) as toil:
            # Import the input file into the job store
            inputAlignmentFileID = toil.importFile(makeURL(alignmentsFile))

            rootJob = Job.wrapJobFn(mappingQualityRescoring,
                                    inputAlignmentFileID,
                                    minimumMapQValue=0,
                                    maxAlignmentsPerSite=1,
                                    alpha=alpha,
                                    logLevel=self.logLevelString)

            primaryOutputAlignmentsFileID, secondaryOutputAlignmentsFileID = toil.start(
                rootJob)
            toil.exportFile(primaryOutputAlignmentsFileID,
                            makeURL(self.simpleOutputCigarPath))
            toil.exportFile(secondaryOutputAlignmentsFileID,
                            makeURL(self.simpleOutputCigarPath2))

        # Check output
        with open(self.simpleOutputCigarPath, 'r') as fh:
            primaryOutputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove newlines

        with open(self.simpleOutputCigarPath2, 'r') as fh:
            secondaryOutputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove newlines

        return primaryOutputCigars + secondaryOutputCigars
Example #22
            def userScript():
                from toil.job import Job
                from toil.common import Toil

                # A user-defined type, i.e. a type defined in the user script
                class X(object):
                    pass

                # noinspection PyUnusedLocal
                def job(job, x, disk='10M', cores=1, memory='10M'):
                    return x

                if __name__ == '__main__':
                    options = Job.Runner.getDefaultArgumentParser().parse_args()
                    x = X()
                    with Toil(options) as toil:
                        r = toil.start(Job.wrapJobFn(job, x).encapsulate())
                    # Assert that the return value is of type X, but not X from the __main__
                    # module but X from foo.bar, the canonical name for the user module. The
                    # translation from __main__ to foo.bar is a side effect of hot-deployment.
                    assert r.__class__ is not X
                    import foo.bar
                    assert r.__class__ is foo.bar.X
                    # Assert that a copy was made. This is a side effect of pickling/unpickling.
                    assert x is not r
Example #23
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline
    :param args: dictionary of arguments from CAT
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(t.importFile('file://' + args.filtered_tm_psl), args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp, input_file_ids.ref_psl,
                        input_file_ids.tm_psl, input_file_ids.annotation_gp]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(t.importFile('file://' + args.augustus_hints_db),
                                                                  args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                file_ids.append(args.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
Example #24
 def start(self):
     with Toil(toil_options) as workflow:
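         # Wrap each model as a workflow input, run the batch modelfit job, and
         # export every result's tool files.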
         input_models = [InputModel(workflow, model) for model in self._models]
         job = BatchModelfit(input_models)
         result_models = workflow.start(job)
         for model in result_models:
             export_files(workflow, model.modelfit_results.tool_files)
Example #25
 def setUp(self):
     super(JobWrapperTest, self).setUp()
     self.jobStorePath = self._getTestJobStorePath()
     parser = ArgumentParser()
     Job.Runner.addToilOptions(parser)
     options = parser.parse_args(args=[self.jobStorePath])
     self.toil = Toil(options)
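     # Entering the Toil context manager should return the Toil instance itself.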
     self.assertEqual(self.toil, self.toil.__enter__())
Example #26
def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path,
                                                                                          fasta_sequences,
                                                                                          hints_args.genome)

            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))

            if hints_args.annotation_gp is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp),
                                                    hints_args.annotation_gp)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)

            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))

            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
Example #27
 def test_download_url(self):
     from toil_lib.urls import download_url
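     # Download a single URL into the temp dir via a Toil job, then confirm the file exists.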
     A = Job.wrapJobFn(download_url,
                       work_dir=self.tmpdir,
                       url='www.google.com',
                       name='testy')
     with Toil(self.options) as toil:
         toil.start(A)
     assert os.path.exists(os.path.join(self.tmpdir, 'testy'))
Example #28
    def testExportAfterFailedExport(self):
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        try:
            with Toil(options) as toil:
                _ = toil.start(HelloWorld())
                # oh no, an error! :(
                raise RuntimeError("we died after workflow completion but before our export finished")
        except RuntimeError:
            pass

        options.restart = True
        with Toil(options) as toil:
            fileID = toil.restart()
            print(fileID)
            # Hopefully the error didn't cause us to lose all our work!
            toil.exportFile(fileID, 'file://' + self.exportPath)
        with open(self.exportPath) as f:
            # The file should have all our content
            self.assertEqual(f.read(), "Hello, World!")
Example #29
 def test_upload_and_download_with_encryption(self):
     from toil_lib.urls import s3am_upload
     from toil_lib.urls import download_url
     from boto.s3.connection import S3Connection, Bucket, Key
     # Create temporary encryption key
     key_path = os.path.join(self.tmpdir, 'foo.key')
     subprocess.check_call([
         'dd', 'if=/dev/urandom', 'bs=1', 'count=32',
         'of={}'.format(key_path)
     ])
     # Create test file
     upload_fpath = os.path.join(self.tmpdir, 'upload_file')
     with open(upload_fpath, 'wb') as fout:
         fout.write(os.urandom(1024))
     # Upload file
     random_key = os.path.join('test/', str(uuid4()), 'upload_file')
     s3_url = os.path.join('s3://cgl-driver-projects/', random_key)
     try:
         s3_dir = os.path.split(s3_url)[0]
         A = Job.wrapJobFn(s3am_upload,
                           fpath=upload_fpath,
                           s3_dir=s3_dir,
                           s3_key_path=key_path)
         with Toil(self.options) as toil:
             toil.start(A)
         # Download the file
         B = Job.wrapJobFn(download_url,
                           url=s3_url,
                           name='download_file',
                           work_dir=self.tmpdir,
                           s3_key_path=key_path)
         with Toil(self.options) as toil:
             toil.start(B)
         download_fpath = os.path.join(self.tmpdir, 'download_file')
         assert os.path.exists(download_fpath)
         assert filecmp.cmp(upload_fpath, download_fpath)
     finally:
         # Delete the Key. Key deletion never fails so we don't need to catch any exceptions
         with closing(S3Connection()) as conn:
             b = Bucket(conn, 'cgl-driver-projects')
             k = Key(b)
             k.key = random_key
             k.delete()
Example #30
def main():
    args = cli()
    samples = parse_manifest(args.manifest)

    # Start Toil run
    with Toil(args) as toil:
        if not toil.options.restart:
            toil.start(Job.wrapJobFn(map_job, run_outlier_model, samples, args))
        else:
            toil.restart()