Example #1
 def _deleteLocallyReadFilesFn(self, readAsMutable):
     self.options.retryCount = 0
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True, memory='10M')
     B = Job.wrapJobFn(self._removeReadFileFn, A.rv(), readAsMutable=readAsMutable,
                       memory='20M')
     A.addChild(B)
     Job.Runner.startToil(A, self.options)
Example #2
    def runNewCheckpointIsLeafVertexTest(self, createWorkflowFn):
        """
        Test verification that a checkpoint job is a leaf vertex using both
        valid and invalid cases.

        :param createWorkflowFn: function to create a new workflow and return a tuple of:

                                 0) the workflow root job
                                 1) a checkpoint job to test within the workflow

        """

        logger.info('Test checkpoint job that is a leaf vertex')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     expectedException=None)

        logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a service')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     checkpointJobService=TrivialService("LeafTestService"),
                                     expectedException=JobGraphDeadlockException)

        logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a child job')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     checkpointJobChild=Job.wrapJobFn(
                                         simpleJobFn, "LeafTestChild"),
                                     expectedException=JobGraphDeadlockException)

        logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a follow-on job')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     checkpointJobFollowOn=Job.wrapJobFn(
                                         simpleJobFn,
                                         "LeafTestFollowOn"),
                                     expectedException=JobGraphDeadlockException)
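The TrivialService helper used above is defined elsewhere in the test module. A minimal sketch of what such a service could look like, assuming only the standard Job.Service interface (start/stop/check); the class body here is illustrative, not the actual helper:

from toil.job import Job

class TrivialService(Job.Service):
    # Hypothetical stand-in: it does no real work, it merely satisfies the
    # Job.Service interface so that attaching it to a checkpoint job makes
    # that job a non-leaf vertex.
    def __init__(self, message, *args, **kwargs):
        Job.Service.__init__(self, *args, **kwargs)
        self.message = message

    def start(self, job):
        # Whatever is returned here is what client jobs receive from addService().
        return self.message

    def stop(self, job):
        pass

    def check(self):
        return True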
Example #3
 def testCacheEjection(self):
     """
     Test that the cache always ejects the least recently created file
     """
     # Make three jobs that each create an output file and write it to the file store.  The combined size of any
     # two files is always less than cacheSize, but the combined size of all 3 is always more, so one file always
     # has to be ejected.  Test to ensure that A is always ejected regardless of size.
     #  Make a temp directory for the test
     test_dir = self._createTempDir()
     for test in xrange(10):
         options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
         options.logLevel = "DEBUG"
         options.cacheSize = 100000
         options.retryCount=100
         options.badWorker=0.5
         options.badWorkerFailInterval = 1.0
         # Create a temp file to write the test results
         handle, logfile = tempfile.mkstemp(dir=test_dir)
         os.close(handle)
         file_sizes = [50000, 40000, 30000]
         # Randomize to (potentially) test all combinations
         random.shuffle(file_sizes)
         # Run the workflow. A, B and C do the cache operations, and D prints test status to tempFile
         A = Job.wrapJobFn(fileTestJob, file_sizes[0])
         B = Job.wrapJobFn(fileTestJob, file_sizes[1])
         C = Job.wrapJobFn(fileTestJob, file_sizes[2])
         D = Job.wrapJobFn(fileTestCache, A.rv(), B.rv(), C.rv(), logfile)
         A.addChild(B)
         B.addChild(C)
         C.addChild(D)
         Job.Runner.startToil(A, options)
         #  Assert jobs passed by reading test results from tempFile
         with open(logfile, 'r') as outfile:
             for test_status in outfile:
                 assert test_status.strip() == 'True'
Example #4
 def testPromiseRequirementRaceStatic(self):
     """
     Checks for a race condition when using promised requirements and child job functions.
     """
     A = Job.wrapJobFn(logDiskUsage, 'A', sleep=5, disk=PromisedRequirement(1024))
     B = Job.wrapJobFn(logDiskUsage, 'B', disk=PromisedRequirement(lambda x: x + 1024, A.rv()))
     A.addChild(B)
     Job.Runner.startToil(A, self.getOptions(self._createTempDir('testFiles')))
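logDiskUsage is not shown in the snippet; a plausible minimal sketch, assuming it only needs to report the disk requirement the job actually received (the signature and log format are guesses):

import time

def logDiskUsage(job, jobName, sleep=0):
    # Report the disk requirement this job was scheduled with and return it,
    # so a downstream PromisedRequirement can build on the value.
    job.fileStore.logToMaster('%s: disk requirement is %s bytes' % (jobName, job.disk))
    time.sleep(sleep)
    return job.disk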
Example #5
 def testReadCachHitFileFromJobStore(self):
     """
     Read a file from the file store that has a corresponding cached copy.  Ensure the number
     of links on the file are appropriate.
     """
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
     B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=True, cacheReadFile=None,
                       fsID=A.rv())
     A.addChild(B)
     Job.Runner.startToil(A, self.options)
Example #6
        def _testCacheMissFunction(self, cacheReadFile):
            """
            This is the function that actually does what the 2 cache miss functions want.

            :param cacheReadFile: Does the read file need to be cached(T) or not(F)
            """
            workdir = self._createTempDir(purpose='nonLocalDir')
            A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=False, nonLocalDir=workdir)
            B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=False,
                              cacheReadFile=cacheReadFile, fsID=A.rv())
            A.addChild(B)
            Job.Runner.startToil(A, self.options)
Example #7
 def _deleteLocallyReadFilesFn(self, readAsMutable):
     self.options.retryCount = 0
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True, memory='10M')
     B = Job.wrapJobFn(self._removeReadFileFn, A.rv(), readAsMutable=readAsMutable, memory='20M')
     A.addChild(B)
     try:
         Job.Runner.startToil(A, self.options)
     except FailedJobsException as err:
         self.assertEqual(err.numberOfFailedJobs, 2)
         errMsg = self._parseAssertionError(self.options.logFile)
         if 'explicitly' not in errMsg:
             self.fail('Shouldn\'t see this')
Example #8
 def testControlledFailedWorkerRetry(self):
     """
     Conduct a couple of job store operations.  Then die.  Ensure that the restarted job is
     tracking values in the cache state file appropriately.
     """
     workdir = self._createTempDir(purpose='nonLocalDir')
     self.options.retryCount = 1
     F = Job.wrapJobFn(self._controlledFailTestFn, jobDisk=2*1024*1024*1024, testDir=workdir,
                       disk='2G')
     G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M')
     F.addChild(G)
     Job.Runner.startToil(F, self.options)
Example #9
 def testToilIsNotBroken(self):
     """
     Runs a simple DAG to test if any features other than caching were broken.
     """
     A = Job.wrapJobFn(self._uselessFunc)
     B = Job.wrapJobFn(self._uselessFunc)
     C = Job.wrapJobFn(self._uselessFunc)
     D = Job.wrapJobFn(self._uselessFunc)
     A.addChild(B)
     A.addChild(C)
     B.addChild(D)
     C.addChild(D)
     Job.Runner.startToil(A, self.options)
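The _uselessFunc job is a no-op; something like the following sketch is all that is required (hypothetical, written as a module-level job function rather than the test's own method):

def _uselessFunc(job):
    # Does nothing: the test only cares that the diamond-shaped DAG
    # A -> (B, C) -> D schedules and completes.
    return None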
Example #10
 def test_star(self):
     """
     Test the functionality of align_rna
     """
     univ_options = self._getTestUnivOptions()
     config_file = os.path.join(self._projectRootPath(), "src/protect/test/test_inputs/ci_parameters.yaml")
     test_src_folder = os.path.join(self._projectRootPath(), "src", "protect", "test")
     a = Job.wrapJobFn(self._get_test_star_files)
     b = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate()
     c = Job.wrapJobFn(self._get_tool, b.rv(), "star")
     d = Job.wrapJobFn(align_rna, a.rv(), univ_options, c.rv()).encapsulate()
     a.addChild(b)
     b.addChild(c)
     c.addChild(d)
     Job.Runner.startToil(a, self.options)
Example #11
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    filtered_gams = []
    skip_words = options.skip.split(",")
    for gam in options.in_gams:
        skip_gam = False
        for word in skip_words:
            if len(word) > 0 and word in gam:
                skip_gam = True
        if not skip_gam:
            filtered_gams.append(gam)
    options.in_gams = filtered_gams

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job,  options)
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()
Example #12
    def test(self):
        """
        Tests that a toil workflow that fails once can be resumed without a NoSuchJobException.
        """
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "INFO"
        root = Job.wrapJobFn(parent)
        with self.assertRaises(FailedJobsException):
            # This one is intended to fail.
            Job.Runner.startToil(root, options)

        # Resume the workflow. Unfortunately, we have to check for
        # this bug using the logging output, since although the
        # NoSuchJobException causes the worker to fail, the batch
        # system code notices that the job has been deleted despite
        # the failure and avoids the failure.
        options.restart = True
        tempDir = self._createTempDir()
        options.logFile = os.path.join(tempDir, "log.txt")
        Job.Runner.startToil(root, options)
        with open(options.logFile) as f:
            logString = f.read()
            # We are looking for e.g. "Batch system is reporting that
            # the jobGraph with batch system ID: 1 and jobGraph
            # store ID: n/t/jobwbijqL failed with exit value 1"
            self.assertTrue("failed with exit value" not in logString)
Example #13
def main():
    parser = OptionParser()
    Job.Runner.addToilOptions(parser)
    
    parser.add_option("--fileToSort", dest="fileToSort",
                      help="The file you wish to sort")
    
    parser.add_option("--N", dest="N",
                      help="The threshold below which a serial sort function is "
                      "used to sort the file. All lines must be of length less "
                      "than or equal to N or the program will fail",
                      default=10000)
    
    options, args = parser.parse_args()
    
    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)
    
    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)
    
    if len(args) != 0:
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))
    
    #Now we are ready to run
    i = Job.Runner.startToil(Job.wrapJobFn(setup, options.fileToSort, int(options.N)), options)
    
    if i:
        raise RuntimeError("The toil contained %i failed jobs" % i)
Example #14
def main(args):

    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args()

    if bool(options.master_ip) == bool(options.num_nodes):
        raise ValueError("Exactly one of --master_ip (%s) and --num_nodes (%d) must be provided." %
                         (options.master_ip, options.num_nodes))

    if options.num_nodes <= 1:
        raise ValueError("--num_nodes allocates one Spark/HDFS master and n-1 workers, and thus must be greater "
                         "than 1. %d was passed." % options.num_nodes)

    inputs = {'numWorkers': options.num_nodes - 1,
              'outDir':     options.output_directory,
              'bamName':    options.input_file_name,
              'knownSNPs':  options.known_SNPs,
              'driverMemory': options.driver_memory,
              'executorMemory': options.executor_memory,
              'sudo': options.sudo,
              'suffix': None,
              'masterIP': options.master_ip}

    Job.Runner.startToil(Job.wrapJobFn(static_adam_preprocessing_dag, inputs), options)
Example #15
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")
    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))
                    
    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
        cores=1, memory="2G", disk=0)
    
    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job,  options)
    else:
        failed_jobs = 0
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    #dist_table(options)
    #acc_table(options)
    snp_count_table(options)
    graph_size_table(options)
Example #16
def main():
    """
    This is a Toil pipeline used to perform variant analysis (usually on exomes) from Tumor/Normal BAMs.
    All samples are co-cleaned (GATK Indel Realignment (IR) and Base Quality Score Recalibration (BQSR))
    before variant analysis is performed by MuTect.  The final output of this pipeline is a tarball
    containing the output of MuTect (.vcf, .cov, .out).

    Please see the associated README.md for an overview and quickstart walkthrough.
    """
    # Define Parser object and add to jobTree
    argparser = build_parser()
    Job.Runner.addToilOptions(argparser)
    pargs = argparser.parse_args()
    # Variables to pass to initial job
    inputs = {
        "ref.fasta": pargs.reference,
        "config": pargs.config,
        "phase.vcf": pargs.phase,
        "mills.vcf": pargs.mills,
        "dbsnp.vcf": pargs.dbsnp,
        "cosmic.vcf": pargs.cosmic,
        "output_dir": pargs.output_dir,
        "ssec": pargs.ssec,
        "s3_dir": pargs.s3_dir,
        "sudo": pargs.sudo,
        "uuid": None,
        "normal.bam": None,
        "tumor.bam": None,
        "cpu_count": None,
    }

    # Launch Pipeline
    Job.Runner.startToil(Job.wrapJobFn(download_shared_files, inputs), pargs)
Example #17
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta, input_file_ids.annotation_gp,
                        input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
Example #18
 def test_mhc_assessment(self):
     """
     Test the functionality of assess_mhc_genes
     """
     univ_options = self._getTestUnivOptions()
     test_src_folder = os.path.join(self._projectRootPath(), 'src', 'protect', 'test')
     a = Job.wrapJobFn(self._get_test_rsem_file, test_src_folder)
     b = Job.wrapJobFn(self._get_MHC_file)
     c = Job.wrapJobFn(self._get_test_haplotype_file, test_src_folder)
     d = Job.wrapJobFn(assess_mhc_genes, a.rv(), c.rv(), univ_options, b.rv())
     e = Job.wrapJobFn(self._test_output, d.rv(), univ_options)
     a.addChild(b)
     b.addChild(c)
     c.addChild(d)
     d.addChild(e)
     Job.Runner.startToil(a, self.options)
Example #19
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'genetorrent': args.genetorrent,
              'genetorrent_key': args.genetorrent_key,
              'ssec': args.ssec,
              's3_dir': args.s3_dir,
              'sudo': args.sudo}
    # Sanity checks
    if args.ssec:
        assert os.path.isfile(args.ssec)
    if args.genetorrent:
        assert os.path.isfile(args.genetorrent)
    if args.genetorrent_key:
        assert os.path.isfile(args.genetorrent_key)
    # Start Pipeline
    Job.Runner.startToil(Job.wrapJobFn(start_batch, inputs), args)
Example #20
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument('--num-lines', default=1000, help='Number of lines in file to sort.', type=int)
    parser.add_argument('--line-length', default=50, help='Length of lines in file to sort.', type=int)
    parser.add_argument("--N",
                        help="The threshold below which a serial sort function is used to sort the file. "
                        "All lines must be of length less than or equal to N or the program will fail",
                        default=10000)

    options = parser.parse_args()

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    file_name = 'file_to_sort.txt'
    make_file_to_sort(file_name=file_name, lines=options.num_lines, line_length=options.line_length)

    with Toil(options) as toil:
        sort_file_url = 'file://' + os.path.abspath('file_to_sort.txt')
        if not toil.options.restart:
            sort_file_id = toil.importFile(sort_file_url)
            sorted_file_id = toil.start(Job.wrapJobFn(setup, sort_file_id, int(options.N), False, memory='1000M'))
        else:
            sorted_file_id = toil.restart()
        toil.exportFile(sorted_file_id, sort_file_url)
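The setup job function belongs to Toil's sort example and is not reproduced here. For the importFile/start/exportFile round trip shown above, the root job only has to return a file store FileID; a self-contained sketch of such a job (the name reverse_lines and its behaviour are illustrative):

def reverse_lines(job, input_file_id):
    # Read the imported file out of the file store, write a transformed copy,
    # and return the new FileID so the caller can toil.exportFile() it.
    in_path = job.fileStore.readGlobalFile(input_file_id)
    out_path = job.fileStore.getLocalTempFile()
    with open(in_path) as fin, open(out_path, 'w') as fout:
        for line in fin:
            fout.write(line.rstrip('\n')[::-1] + '\n')
    return job.fileStore.writeGlobalFile(out_path)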
Example #21
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    for graph in options.graphs:
        if os.path.splitext(graph)[1] != ".vg":
            raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
        cores=1, memory="2G", disk=0)
    
    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job,  options)
    else:
        failed_jobs = 0
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()

    # Do the drawing outside toil to get around weird import problems
    cluster_comparisons(options)
Example #22
def main():
    """
    This is a Toil pipeline for the UNC best practice RNA-Seq analysis.
    RNA-seq fastqs are combined, aligned, sorted, filtered, and quantified.

    Please read the README.md located in the same directory.
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'config': args.config,
              'config_fastq': args.config_fastq,
              'input': args.input,
              'unc.bed': args.unc,
              'hg19.transcripts.fa': args.fasta,
              'composite_exons.bed': args.composite_exons,
              'normalize.pl': args.normalize,
              'output_dir': args.output_dir,
              'rsem_ref.zip': args.rsem_ref,
              'chromosomes.zip': args.chromosomes,
              'ebwt.zip': args.ebwt,
              'ssec': args.ssec,
              's3_dir': args.s3_dir,
              'sudo': args.sudo,
              'single_end_reads': args.single_end_reads,
              'upload_bam_to_s3': args.upload_bam_to_s3,
              'uuid': None,
              'sample.tar': None,
              'cpu_count': None}

    # Launch jobs
    Job.Runner.startToil(Job.wrapJobFn(download_shared_files, inputs), args)
Example #23
 def testJobFileStoreWithSmallCache(self, retryCount=0, badWorker=0.0, 
                      stringNo=1, stringLength=1000000, cacheSize=10000, testNo=2):
     """
     Creates a chain of jobs, each reading and writing files using the 
     Job.FileStore interface. Verifies the files written are always what we expect.
     The chain tests the caching behavior. 
     """
     for test in xrange(testNo):
         # Make a list of random strings, each of stringLength chars, and map the
         # first PREFIX_LENGTH characters of each string to the full string
         def randomString():
             chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             s = "".join(map(lambda i : random.choice(chars), xrange(stringLength)))
             return s[:PREFIX_LENGTH], s
         # Build the prefix -> string map (stringNo strings of stringLength chars each)
         testStrings = dict(map(lambda i : randomString(), xrange(stringNo)))
         options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
         options.logLevel = "INFO"
         options.cacheSize = cacheSize
         options.retryCount=retryCount
         options.badWorker=badWorker
         options.badWorkerFailInterval = 1.0
         chainLength = 10
         # Run the workflow, the return value being the number of failed jobs
         Job.Runner.startToil(Job.wrapJobFn(fileTestJob, [], 
                                            testStrings, chainLength), 
                              options)
Example #24
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'genetorrent': args.genetorrent,
              'genetorrent_key': args.genetorrent_key,
              'ssec': args.ssec,
              's3_dir': args.s3_dir}
    # Sanity checks
    if args.ssec:
        assert os.path.isfile(args.ssec)
    if args.genetorrent:
        assert os.path.isfile(args.genetorrent)
    if args.genetorrent_key:
        assert os.path.isfile(args.genetorrent_key)
    samples = parse_genetorrent(args.genetorrent)
    # Start pipeline
    # map_job accepts a function, an iterable, and *args. The function is launched as a child
    # process with one element from the iterable and *args, which in turn spawns a tree of child jobs.
    Job.Runner.startToil(Job.wrapJobFn(map_job, download_and_transfer_sample, samples, inputs), args)
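The comment above describes the map_job fan-out from toil_lib. A simplified sketch of that pattern (the real map_job also spreads very large iterables over a tree of jobs; this version only shows the one-child-per-sample idea):

def map_job(job, func, inputs, *args):
    # Launch one child job per element of `inputs`; each child receives its
    # element plus the shared *args and may itself spawn further children.
    for item in inputs:
        job.addChildJobFn(func, item, *args)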
Example #25
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument("--fileToSort", dest="fileToSort",
                      help="The file you wish to sort")

    parser.add_argument("--N", dest="N",
                      help="The threshold below which a serial sort function is "
                      "used to sort the file. All lines must be of length less "
                      "than or equal to N or the program will fail",
                      default=10000)

    options = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    #Now we are ready to run
    Job.Runner.startToil(Job.wrapJobFn(setup, options.fileToSort, int(options.N),
                                       memory=sortMemory), options)
Example #26
def main(args):
    """
    Parses command line arguments and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """
    
    if len(args) == 2 and args[1] == "--test":
        # Run the tests
        return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
    
    options = parse_args(args) # This holds the nicely-parsed options object
    
    RealTimeLogger.start_master()
    
    # Make a root job
    root_job = Job.wrapJobFn(collate_all, options,
        cores=1, memory="1G", disk="1G")
    
    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job,  options)
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
        
    print("All jobs completed successfully")
    
    RealTimeLogger.stop_master()
Example #27
            def userScript():
                from toil.job import Job
                from toil.common import Toil

                # A user-defined type, i.e. a type defined in the user script
                class X(object):
                    pass

                # noinspection PyUnusedLocal
                def job(job, x, disk='10M', cores=1, memory='10M'):
                    return x

                if __name__ == '__main__':
                    options = Job.Runner.getDefaultArgumentParser().parse_args()
                    x = X()
                    with Toil(options) as toil:
                        r = toil.start(Job.wrapJobFn(job, x).encapsulate())
                    # Assert that the return value is of type X, but not X from the __main__
                    # module but X from foo.bar, the canonical name for the user module. The
                    # translation from __main__ to foo.bar is a side effect of hot-deployment.
                    assert r.__class__ is not X
                    import foo.bar
                    assert r.__class__ is foo.bar.X
                    # Assert that a copy was made. This is a side effect of pickling/unpickling.
                    assert x is not r
Example #28
 def testEncapsulation(self):
     """
     Tests the Job.encapsulate() method, which uses the EncapsulatedJob
     class.
     """
     # Temporary file
     outFile = getTempFile(rootDir=self._createTempDir())
     try:
         # Encapsulate a job graph
         a = T.wrapJobFn(encapsulatedJobFn, "A", outFile)
         a = a.encapsulate()
         # Now add children/follow to the encapsulated graph
         d = T.wrapFn(f, a.rv(), outFile)
         e = T.wrapFn(f, d.rv(), outFile)
         a.addChild(d)
         a.addFollowOn(e)
         # Create the runner for the workflow.
         options = T.Runner.getDefaultOptions(self._getTestJobStorePath())
         options.logLevel = "INFO"
         # Run the workflow, the return value being the number of failed jobs
         T.Runner.startToil(a, options)
         # Check output
         self.assertEquals(open(outFile, 'r').readline(), "ABCDE")
     finally:
         os.remove(outFile)
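For the final assertion to read "ABCDE", each wrapped f call only needs to append the string it is passed and return the next letter for its successor. A hypothetical sketch (the real f and encapsulatedJobFn live in the test module):

def f(string, outFile):
    # Append this job's letter to the shared output file and hand the next
    # letter to the following job via the return value / promise.
    with open(outFile, 'a') as fH:
        fH.write(string)
    return chr(ord(string) + 1)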
Example #29
 def testWriteLocalFileToJobStore(self):
     """
     Write a file from the localTempDir to the job store.  Such a file will be cached by
     default.  Ensure the file is cached.
     """
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
     Job.Runner.startToil(A, self.options)
Example #30
 def testDeleteLocalFile(self):
     """
     Test the deletion capabilities of deleteLocalFile
     """
     self.options.retryCount = 0
     workdir = self._createTempDir(purpose='nonLocalDir')
     A = Job.wrapJobFn(self._deleteLocalFileFn, nonLocalDir=workdir)
     Job.Runner.startToil(A, self.options)
Example #31
 def testTrivialDAGConsistency(self):
     options = Job.Runner.getDefaultOptions(self._createTempDir() +
                                            '/jobStore')
     options.clean = 'always'
     options.logLevel = 'debug'
     i = Job.wrapJobFn(trivialParent)
     with Toil(options) as toil:
         try:
             toil.start(i)
         except FailedJobsException:
             # we expect this exception to be raised
             pass
         else:
             self.fail()
Example #32
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline
    :param args: dictionary of arguments from CAT
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(
                t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(
                t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(
                t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(
                t.importFile('file://' + args.filtered_tm_psl),
                args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(
                t.importFile('file://' + args.annotation_gp),
                args.annotation_gp)
            file_ids = [
                input_file_ids.genome_fasta, input_file_ids.coding_gp,
                input_file_ids.ref_psl, input_file_ids.tm_psl,
                input_file_ids.annotation_gp
            ]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(
                    t.importFile('file://' + args.augustus_hints_db),
                    args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(
                    t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                file_ids.append(args.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup,
                                args,
                                input_file_ids,
                                disk_usage,
                                disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
Example #33
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument('--reference', required=True)
    parser.add_argument('--target', required=True)
    parser.add_argument('--chunk-size', default=500, type=int)
    parser.add_argument('--out-psl', required=True)
    parser.add_argument('--ooc')
    args = parser.parse_args()
    r = Job.Runner.startToil(Job.wrapJobFn(setup, os.path.abspath(args.reference),
                                           os.path.abspath(args.target), args.chunk_size, args.ooc,
                         memory='4G'), args)
    with open(args.out_psl, 'w') as outf:
        outf.write(r)
Example #34
        def _testMultipleJobsReadGlobalFileFunction(self, cacheHit):
            """
            This function does what the two Multiple File reading tests want to do

            :param bool cacheHit: Is the test for the CacheHit case(T) or cacheMiss case(F)
            """
            dirPurpose = 'tempWriteDir' if cacheHit else 'nonLocalDir'
            workdir = self._createTempDir(purpose=dirPurpose)
            with open(os.path.join(workdir, 'test'), 'w') as x:
                x.write(str(0))
            A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=cacheHit, nonLocalDir=workdir,
                              fileMB=256)
            B = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M')
            jobs = {}
            for i in xrange(0, 10):
                jobs[i] = Job.wrapJobFn(self._multipleFileReader, diskMB=1024, fsID=A.rv(),
                                        maxWriteFile=os.path.abspath(x.name), disk='1G',
                                        memory='10M', cores=1)
                A.addChild(jobs[i])
                jobs[i].addChild(B)
            Job.Runner.startToil(A, self.options)
            with open(x.name, 'r') as y:
                assert int(y.read()) > 2
Example #35
        def testConcurrencyDynamic(self):
            """
            Asserts that promised core resources are allocated properly using a dynamic Toil workflow
            """
            for coresPerJob in self.allocatedCores:
                log.debug('Testing %d cores per job with CPU count %d', coresPerJob, self.cpuCount)
                tempDir = self._createTempDir('testFiles')
                counterPath = self.getCounterPath(tempDir)

                root = Job.wrapJobFn(maxConcurrency, self.cpuCount, counterPath, coresPerJob,
                                     cores=1, memory='1M', disk='1M')
                values = Job.Runner.startToil(root, self.getOptions(tempDir))
                maxValue = max(values)
                self.assertEqual(maxValue, old_div(self.cpuCount, coresPerJob))
Example #36
                def root(rootJob):
                    def nullFile():
                        return rootJob.fileStore.jobStore.importFile(
                            'file:///dev/null')

                    startFile = nullFile()
                    endFile = nullFile()
                    rootJob.addChildJobFn(deferring, startFile, endFile)
                    encapsulatedJob = Job.wrapJobFn(encapsulated, startFile)
                    encapsulatedJob.addChildFn(dummy)
                    encapsulatedJob.addChildFn(dummy)
                    encapsulatingJob = encapsulatedJob.encapsulate()
                    rootJob.addChild(encapsulatingJob)
                    encapsulatingJob.addChildJobFn(last, endFile)
Example #37
            def userScript():
                from toil.common import Toil
                from toil.job import Job

                def root(rootJob):
                    def nullFile():
                        return rootJob.fileStore.jobStore.import_file(
                            'file:///dev/null')

                    startFile = nullFile()
                    endFile = nullFile()
                    rootJob.addChildJobFn(deferring, startFile, endFile)
                    encapsulatedJob = Job.wrapJobFn(encapsulated, startFile)
                    encapsulatedJob.addChildFn(dummy)
                    encapsulatedJob.addChildFn(dummy)
                    encapsulatingJob = encapsulatedJob.encapsulate()
                    rootJob.addChild(encapsulatingJob)
                    encapsulatingJob.addChildJobFn(last, endFile)

                def dummy():
                    pass

                def deferred():
                    pass

                # noinspection PyUnusedLocal
                def deferring(job, startFile, endFile):
                    job.defer(deferred)
                    job.fileStore.jobStore.delete_file(startFile)
                    timeout = time.time() + 10
                    while job.fileStore.jobStore.file_exists(endFile):
                        assert time.time() < timeout
                        time.sleep(1)

                def encapsulated(job, startFile):
                    timeout = time.time() + 10
                    while job.fileStore.jobStore.file_exists(startFile):
                        assert time.time() < timeout
                        time.sleep(1)

                def last(job, endFile):
                    job.fileStore.jobStore.delete_file(endFile)

                if __name__ == '__main__':
                    options = Job.Runner.getDefaultArgumentParser().parse_args(
                    )
                    with Toil(options) as toil:
                        rootJob = Job.wrapJobFn(root)
                        toil.start(rootJob)
Example #38
            def userScript():
                from toil.job import Job
                from toil.common import Toil

                # noinspection PyUnusedLocal
                def job(job, disk='10M', cores=1, memory='10M'):
                    assert False

                if __name__ == '__main__':
                    options = Job.Runner.getDefaultArgumentParser().parse_args()
                    with Toil(options) as toil:
                        if toil.config.restart:
                            toil.restart()
                        else:
                            toil.start(Job.wrapJobFn(job))
Example #39
        def restartScript():
            from toil.job import Job
            import argparse
            import os

            def f0(job):
                if 'FAIL' in os.environ:
                    raise RuntimeError('failed on purpose')

            if __name__ == '__main__':
                parser = argparse.ArgumentParser()
                Job.Runner.addToilOptions(parser)
                options = parser.parse_args()
                rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M')
                Job.Runner.startToil(rootJob, options)
Example #40
        def testPromisesWithJobStoreFileObjects(self, caching=True):
            """
            Check whether FileID objects are being pickled properly when used as return
            values of functions.  Then ensure that lambdas of promised FileID objects can be
            used to describe the requirements of a subsequent job.  This type of operation will be
            used commonly in Toil scripts.
            :return: None
            """
            file1 = 1024
            file2 = 512
            F1 = Job.wrapJobFn(_writer, file1)
            F2 = Job.wrapJobFn(_writer, file2)
            G = Job.wrapJobFn(_follower,
                              file1 + file2,
                              disk=PromisedRequirement(
                                  lambda x, y: x.size + y.size, F1.rv(),
                                  F2.rv()))
            F1.addChild(F2)
            F2.addChild(G)

            Job.Runner.startToil(
                F1,
                self.getOptions(self._createTempDir('testFiles'),
                                caching=caching))
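The test relies on _writer returning a FileID whose .size attribute can feed the lambda in the PromisedRequirement. A minimal sketch under that assumption (the real helper may differ):

def _writer(job, fileSize):
    # Write `fileSize` bytes into the job store and return the resulting
    # FileID; FileID.size is what the promised-requirement lambda consumes.
    with job.fileStore.writeGlobalFileStream() as (fileHandle, fileID):
        fileHandle.write(b'\x00' * fileSize)
    return fileID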
Example #41
def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False, outputSequences = [], maskAlpha=False, clipAlpha=None,
                  maskPAF=None, inputEventNames=None, brnnCores=None):
    #Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)

    # Make sure we have the dna-brnn model in the filestore if we need it
    loadDnaBrnnModel(toil, ET.parse(configFile).getroot(), maskAlpha = maskAlpha)
        
    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if maskAlpha or clipAlpha:
        ConfigWrapper(configNode).setPreprocessorActive("lastzRepeatMask", False)
        ConfigWrapper(configNode).setPreprocessorActive("dna-brnn", True)
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                if clipAlpha:
                    node.attrib["action"] = "clip"                    
    if brnnCores is not None:
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                node.attrib["cpu"] = brnnCores
        
    if not restart:
        inputSequenceIDs = []
        for seq in inputSequences:
            logger.info("Importing {}".format(seq))
            inputSequenceIDs.append(toil.importFile(makeURL(seq)))
        if maskPAF:
            inputPAFID = toil.importFile(makeURL(maskPAF))
        else:
            inputPAFID = None
        unzip_job = Job.wrapJobFn(unzip_then_pp, configNode, inputSequences, inputSequenceIDs, inputEventNames, maskPAF, inputPAFID)
        outputSequenceIDs = toil.start(unzip_job)
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        try:
            iter(seqID)
            # dna-brnn will output a couple of bed files.  we scrape those out here
            toil.exportFile(seqID[0], makeURL(path))
            toil.exportFile(seqID[1], makeURL(path) + '.bed')
            toil.exportFile(seqID[2], makeURL(path) + '.mask.bed')
        except:
            toil.exportFile(seqID, makeURL(path))
Example #42
 def _runAndReturnWorkDir(self, cleanWorkDir, job, expectError=False):
     """
     Runs toil with the specified job and cleanWorkDir setting. expectError determines whether the toil run
     is expected to fail; the test fails if that expectation is not met. Returns the contents of
     the workDir after completion of the run.
     """
     options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     options.workDir = self.testDir
     options.clean = "always"
     options.cleanWorkDir = cleanWorkDir
     A = Job.wrapJobFn(job)
     if expectError:
         self._launchError(A, options)
     else:
         self._launchRegular(A, options)
     return os.listdir(self.testDir)
Example #43
 def testReturnFileSizesWithBadWorker(self):
     """
     Write a couple of files to the jobstore.  Delete a couple of them.  Read back written
     and locally deleted files.  Ensure that after every step that the cache state file is
     describing the correct values.
     """
     self.options.retryCount = 20
     self.options.badWorker = 0.5
     self.options.badWorkerFailInterval = 0.1
     workdir = self._createTempDir(purpose='nonLocalDir')
     F = Job.wrapJobFn(self._returnFileTestFn,
                       jobDisk=2 * 1024 * 1024 * 1024,
                       initialCachedSize=0,
                       nonLocalDir=workdir,
                       numIters=30, disk='2G')
     Job.Runner.startToil(F, self.options)
Example #44
 def testImportLinking(self):
     """
     importFile will link instead of copy into the jobStore if the ``--linkImports`` option is specified.
     We want to test this behavior.
     """
     options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     options.linkImports = True
     fileName = 'dummyFile.txt'
     with open(fileName, 'w') as fh:
         fh.write('Subtle literature reference.')
     with Toil(options) as workflow:
         fileID = workflow.importFile('file://' + os.path.abspath(fileName))
         workflow.start(
             Job.wrapJobFn(compareiNodes, fileID,
                           os.path.abspath(fileName)))
     os.remove(fileName)
Example #45
 def testDockerPipeChain(self, caching=True):
     """
     Test the piping API for dockerCall().  Using this API (activated when a list of
     argument lists is given as parameters), commands are piped together into a chain.
     ex:  parameters=[ ['printf', 'x\n y\n'], ['wc', '-l'] ] should execute:
     printf 'x\n y\n' | wc -l
     """
     options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, 'jobstore'))
     options.logLevel = 'INFO'
     options.workDir = self.tempDir
     options.clean = 'always'
     if not caching:
         options.disableCaching = True
     A = Job.wrapJobFn(_testDockerPipeChainFn)
     rv = Job.Runner.startToil(A, options)
     assert rv.strip() == '2'
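A sketch of what _testDockerPipeChainFn could look like, assuming Toil's apiDockerCall helper from toil.lib.docker and its documented behaviour that a list of argument lists is piped inside the container; the image tag and exact call are illustrative:

from toil.lib.docker import apiDockerCall

def _testDockerPipeChainFn(job):
    # The two argument lists are joined with a pipe inside the container,
    # i.e. printf 'x\n y\n' | wc -l, and the captured output is returned.
    return apiDockerCall(job,
                         image='ubuntu:16.04',
                         parameters=[['printf', 'x\n y\n'], ['wc', '-l']])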
Example #46
 def testDockerPipeChainErrorDetection(self, disableCaching=True):
     """
     By default, executing cmd1 | cmd2 | ... | cmdN, will only return an
     error if cmdN fails.  This can lead to all manner of errors being
     silently missed.  This tests to make sure that the piping API for
     dockerCall() throws an exception if non-last commands in the chain fail.
     """
     options = Job.Runner.getDefaultOptions(
         os.path.join(self.tempDir, 'jobstore'))
     options.logLevel = self.dockerTestLogLevel
     options.workDir = self.tempDir
     options.clean = 'always'
     options.caching = disableCaching
     A = Job.wrapJobFn(_testDockerPipeChainErrorFn)
     rv = Job.Runner.startToil(A, options)
     assert rv == True
Example #47
def main():
    """
    This is the main function
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_groups',
                        '-S',
                        dest='sample_groups',
                        help='sample_groups.dill',
                        type=str,
                        required=True)
    parser.add_argument('--creds',
                        '-C',
                        dest='creds',
                        help='GDC token file.',
                        type=str,
                        required=True)
    parser.add_argument('--output_folder',
                        '-O',
                        dest='output_folder',
                        help='Output folder.',
                        type=str,
                        required=False,
                        default='outputs')
    parser.add_argument('--input_folder',
                        '-I',
                        dest='input_folder',
                        help='Input folder.',
                        type=str,
                        required=False,
                        default=os.getcwd())
    Job.Runner.addToilOptions(parser)
    params = parser.parse_args()

    params.sample_groups = os.path.abspath(params.sample_groups)
    params.creds = os.path.abspath(params.creds)
    params.output_folder = os.path.abspath(params.output_folder)
    params.input_folder = os.path.abspath(params.input_folder)

    start = Job.wrapJobFn(launchpad,
                          params.sample_groups,
                          params.input_folder,
                          params.output_folder,
                          params.creds,
                          cores=1)
    Job.Runner.startToil(start, params)
    return None
Example #48
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """

    # Define Parser object and add to toil
    def existing_file(fname):
        """
        Argparse type for an existing file
        """
        if not os.path.isfile(fname):
            raise ValueError("Invalid file: " + str(fname))
        return fname

    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '--sudo',
        dest='sudo',
        default=None,
        action='store_true',
        help=
        'Docker usually needs sudo to execute locally, but not when running Mesos or when '
        'the user is a member of a Docker group.')
    Job.Runner.addToilOptions(parser)
    parser.add_argument('datafiles',
                        nargs='+',
                        help='FASTA input',
                        type=existing_file)

    args = parser.parse_args()

    assert args.jobStore is not None
    config = Config()
    config.setOptions(args)

    # Store inputs from argparse
    inputs = {'sudo': args.sudo}
    datafiles = [os.path.abspath(d) for d in args.datafiles]
    # Start Pipeline
    options = Job.Runner.getDefaultOptions("./toilWorkflow")

    Job.Runner.startToil(Job.wrapJobFn(start_batch, datafiles, inputs),
                         options)
Example #49
def main():
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    
    parser.add_argument('--minSleep', type=int, default=1,
        help="Minimum seconds to sleep")
    
    Job.Runner.addToilOptions(parser)
    
    options = parser.parse_args(sys.argv[1:])

    root_job = Job.wrapJobFn(root, options)

    with Toil(options) as toil:
        results = toil.start(root_job)
        
    print("Caching results:")
    print(results)
Example #50
 def testSiblingDAGConsistency(self):
     """
     Slightly more complex case. The stranded job's predecessors are siblings instead of
     parent/child.
     """
     options = Job.Runner.getDefaultOptions(self._createTempDir() + '/jobStore')
     options.clean = 'always'
     options.logLevel = 'debug'
     i = Job.wrapJobFn(diamond)
     with Toil(options) as toil:
         try:
             toil.start(i)
         except FailedJobsException:
             # we expect this exception to be raised
             pass
         else:
             self.fail()
Example #51
def main(args):
    """
    Parses command line arguments and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """

    options = parse_args(args)  # This holds the nicely-parsed options object

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    logging.info("Running on Toil from {}".format(toil.__file__))

    # Add the drunner to the options and initialize stuff from the Toil VG
    # config
    toilvgfacade.initialize(options)

    # Start up Toil
    with Toil(options) as toil_instance:

        if toil_instance.options.restart:
            # We're re-running. Grab the root job return value from restart
            directory = toil_instance.restart()
        else:
            # Run from the top

            # Don't import on the master. Let the nodes handle the download.

            # Make a root job
            root_job = Job.wrapJobFn(main_job,
                                     options,
                                     options.sam_url,
                                     cores=1,
                                     memory="1G",
                                     disk="1G")

            # Run the root job and get the final output directory
            directory = toil_instance.start(root_job)

        # Export the results
        directory.export_to(lambda id, url: toil_instance.exportFile(id, url),
                            options.out_url)

    print("Toil workflow complete")
    return 0
Example #52
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(run_and_evaluate, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and get the return value
    answer = Job.Runner.startToil(root_job,  options)

    RealTimeLogger.stop_master()
    
    print("Root return value:")
    print(answer)
Example #53
        def testPromisedRequirementDynamic(self):
            """
            Asserts that promised core resources are allocated properly using a dynamic Toil workflow
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                counterPath = self.getCounterPath(tempDir)

                root = Job.wrapJobFn(maxConcurrency,
                                     self.cpuCount,
                                     counterPath,
                                     coresPerJob,
                                     cores=1,
                                     memory='1M',
                                     disk='1M')
                values = Job.Runner.startToil(root, self.getOptions(tempDir))
                maxValue = max(values)
                self.assertEqual(maxValue, self.cpuCount / coresPerJob)
Example #54
    def wordCount(self,
                  badWorker=0.0,
                  badWorkerFailInterval=0.05,
                  checkpoint = True):

        # make workdir
        workDir = tempfile.mkdtemp()
        os.rmdir(workDir)

        # wrap _count as a job
        countJob = Job.wrapJobFn(_count, 1, checkpoint = checkpoint)
        options = Job.Runner.getDefaultOptions(workDir)
        options.batchSystem = 'singleMachine'
        options.badWorker = badWorker
        options.badWorkerFailInterval = badWorkerFailInterval
        options.clean = 'never'

        Job.Runner.startToil(countJob, options)
Example #55
    def testService(self, checkpoint=False):
        """
        Tests the creation of a Job.Service with random failures of the worker.
        """
        for test in range(2):
            outFile = getTempFile(rootDir=self._createTempDir()) # Temporary file
            messageInt = random.randint(1, sys.maxsize)
            try:
                # Wire up the services/jobs
                t = Job.wrapJobFn(serviceTest, outFile, messageInt, checkpoint=checkpoint)

                # Run the workflow repeatedly until success
                self.runToil(t)

                # Check output
                self.assertEqual(int(open(outFile, 'r').readline()), messageInt)
            finally:
                os.remove(outFile)
Example #56
def main():
    """
    This is the main function for ProTECT.
    """
    parser = argparse.ArgumentParser(prog='ProTECT',
                                     description='Prediction of T-Cell Epitopes for Cancer Therapy',
                                     epilog='Contact Arjun Rao ([email protected]) if you encounter '
                                     'any problems while running ProTECT')
    inputs = parser.add_mutually_exclusive_group(required=True)
    inputs.add_argument('--config_file', dest='config_file', help='Config file to be used in the '
                        'run.', type=str, default=None)
    inputs.add_argument('--generate_config', dest='generate_config', help='Generate a config file '
                        'in the current directory that is pre-filled with references and flags for '
                        'an hg19 run.', action='store_true', default=False)
    parser.add_argument('--max-cores-per-job', dest='max_cores', help='Maximum cores to use per '
                        'job. Aligners and Haplotypers ask for cores dependent on the machine that '
                        'the launchpad gets assigned to -- In a heterogeneous cluster, this can '
                        'lead to problems. This value should be set to the number of cpus on the '
                        'smallest node in a cluster.',
                        type=int, required=False, default=None)
    # We parse the args once to see if the user has asked for a config file to be generated.  In
    # this case, we don't need a jobstore.  To handle the case where Toil arguments are passed to
    # ProTECT, we parse known args, and if the user specified config_file instead of generate_config
    # we re-parse the arguments with the added Toil parser.
    params, others = parser.parse_known_args()
    if params.generate_config:
        generate_config_file()
    else:
        Job.Runner.addToilOptions(parser)
        params = parser.parse_args()
        params.config_file = os.path.abspath(params.config_file)
        if params.maxCores:
            if not params.max_cores:
                params.max_cores = int(params.maxCores)
            else:
                if params.max_cores > int(params.maxCores):
                    print("The value provided to max-cores-per-job (%s) was greater than that "
                          "provided to maxCores (%s). Setting max-cores-per-job = maxCores." %
                          (params.max_cores, params.maxCores), file=sys.stderr)
                    params.max_cores = int(params.maxCores)
        start = Job.wrapJobFn(parse_config_file, params.config_file, params.max_cores)
        Job.Runner.startToil(start, params)
    return None
Example #57
 def testDockerPipeChain(self, disableCaching=True):
     """
     Test the piping API for dockerCall().  Using this API (activated when
     a list of argument lists is given as parameters), commands are piped
     together into a chain.
     ex:  parameters=[ ['printf', 'x\n y\n'], ['wc', '-l'] ] should execute:
     printf 'x\n y\n' | wc -l
     """
     options = Job.Runner.getDefaultOptions(
         os.path.join(self.tempDir, 'jobstore'))
     options.logLevel = self.dockerTestLogLevel
     options.workDir = self.tempDir
     options.clean = 'always'
     options.caching = disableCaching
     A = Job.wrapJobFn(_testDockerPipeChainFn)
     rv = Job.Runner.startToil(A, options)
     logger.info('Container pipeline result: %s', repr(rv))
     rv = rv.decode('utf-8')
     assert rv.strip() == '2'
Example #58
            def userScript():
                from toil.job import Job
                from toil.common import Toil
                # noinspection PyUnresolvedReferences
                from toil_lib.foo import libraryJob

                # noinspection PyUnusedLocal
                def job(job, disk='10M', cores=1, memory='10M'):
                    # Double the requirements to prevent chaining as chaining might hide problems
                    # in hot deployment code.
                    job.addChildJobFn(libraryJob, disk='20M', cores=cores, memory=memory)

                if __name__ == '__main__':
                    options = Job.Runner.getDefaultArgumentParser().parse_args()
                    with Toil(options) as toil:
                        if toil.config.restart:
                            toil.restart()
                        else:
                            toil.start(Job.wrapJobFn(job))
Example #59
def chaining(args, toil_options):
    """entry point to this program"""
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.query_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.query_two_bit = FileID.forPath(t.importFile('file://' + args.query_two_bit),
                                                          args.query_two_bit)
            target_two_bit_file_ids = {genome: FileID.forPath(t.importFile('file://' + f), f)
                                       for genome, f in args.target_two_bits.iteritems()}
            input_file_ids.target_two_bits = target_two_bit_file_ids
            job = Job.wrapJobFn(setup, args, input_file_ids)
            chain_file_ids = t.start(job)
        else:
            chain_file_ids = t.restart()
        for chain_file, chain_file_id in chain_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(chain_file)
            t.exportFile(chain_file_id, 'file://' + chain_file)
Example #60
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument('--num-lines',
                        default=1000,
                        help='Number of lines in file to sort.',
                        type=int)
    parser.add_argument('--line-length',
                        default=50,
                        help='Length of lines in file to sort.',
                        type=int)
    parser.add_argument(
        "--N",
        help=
        "The threshold below which a serial sort function is used to sort the file. "
        "All lines must be of length less than or equal to N or the program will fail",
        default=10000)

    options = parser.parse_args()

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    file_name = 'file_to_sort.txt'
    make_file_to_sort(file_name=file_name,
                      lines=options.num_lines,
                      line_length=options.line_length)

    with Toil(options) as toil:
        if not toil.options.restart:
            sort_file_url = 'file://' + os.path.abspath('file_to_sort.txt')
            sort_file_id = toil.importFile(sort_file_url)
            sorted_file_id = toil.start(
                Job.wrapJobFn(setup,
                              sort_file_id,
                              int(options.N),
                              False,
                              memory='1000M'))
            toil.exportFile(sorted_file_id, sort_file_url)
        else:
            toil.restart()