def __init__(self, tree, event, sleepTime, startTime, cpu):
     Job.__init__(self, cpu=cpu)
     self.tree = tree
     self.event = event
     self.sleepTime = sleepTime
     self.startTime = startTime
     self.cpu = cpu
Example #2
        def testJobConcurrency(self):
            """
            Tests that the batch system is allocating core resources properly for concurrent tasks.
            """
            for cores_per_job in self.allocated_cores:
                temp_dir = self._createTempDir('testFiles')

                options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
                options.workDir = temp_dir
                options.maxCores = self.cpu_count
                options.batchSystem = self.batchSystemName

                counter_path = os.path.join(temp_dir, 'counter')
                resetCounters(counter_path)
                value, max_value = getCounters(counter_path)
                assert (value, max_value) == (0, 0)

                root = Job()
                for _ in range(self.cpu_count):
                    root.addFollowOn(Job.wrapFn(measureConcurrency, counter_path, self.sleep_time,
                                                cores=cores_per_job, memory='1M', disk='1Mi'))
                Job.Runner.startToil(root, options)

                _, max_value = getCounters(counter_path)
                self.assertEqual(max_value, self.cpu_count / cores_per_job)
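The counter helpers used above (resetCounters, getCounters, measureConcurrency) are not shown in this snippet. A minimal sketch of what they could look like, assuming the counter file simply holds the current and maximum observed job counts and a file lock serializes concurrent updates:

import fcntl
import time

def resetCounters(path):
    # The counter file holds "current,max".
    with open(path, 'w') as fh:
        fh.write('0,0')

def getCounters(path):
    with open(path, 'r') as fh:
        current, maximum = fh.read().split(',')
    return int(current), int(maximum)

def _updateCounters(path, delta):
    with open(path, 'r+') as fh:
        fcntl.lockf(fh, fcntl.LOCK_EX)  # serialize updates from concurrent jobs
        current, maximum = [int(x) for x in fh.read().split(',')]
        current += delta
        maximum = max(maximum, current)
        fh.seek(0)
        fh.truncate()
        fh.write('%d,%d' % (current, maximum))

def measureConcurrency(path, sleepTime):
    # Register this job as running, hold the slot for sleepTime, then release it.
    _updateCounters(path, +1)
    time.sleep(sleepTime)
    _updateCounters(path, -1)
    return True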
Example #3
 def makeWorkflow():
     job = Job()
     r1 = job.addService(TestServiceSerialization("woot1"))
     r2 = job.addService(TestServiceSerialization("woot2"))
     r3 = job.addService(TestServiceSerialization("woot3"))
     job.addChildFn(fnTest, [ r1, r2, r3 ], outFile)
     return job
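fnTest is not shown here; by the time it runs, the promises r1, r2 and r3 have been resolved to whatever each service's start() method returned. A hypothetical version that simply records those values:

def fnTest(strings, outFile):
    # strings is the list of resolved service start() values, e.g. ["woot1", "woot2", "woot3"]
    with open(outFile, 'w') as fh:
        fh.write(" ".join(strings))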
Example #4
    def runNewCheckpointIsLeafVertexTest(self, createWorkflowFn):
        """
        Test verification that a checkpoint job is a leaf vertex using both
        valid and invalid cases.

        :param createWorkflowFn: function to create a new workflow and return a tuple of:

                                 0) the workflow root job
                                 1) a checkpoint job to test within the workflow

        """

        logger.info('Test checkpoint job that is a leaf vertex')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     expectedException=None)

        logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a service')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     checkpointJobService=TrivialService("LeafTestService"),
                                     expectedException=JobGraphDeadlockException)

        logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a child job')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     checkpointJobChild=Job.wrapJobFn(
                                         simpleJobFn, "LeafTestChild"),
                                     expectedException=JobGraphDeadlockException)

        logger.info('Test checkpoint job that is not a leaf vertex due to the presence of a follow-on job')
        self.runCheckpointVertexTest(*createWorkflowFn(),
                                     checkpointJobFollowOn=Job.wrapJobFn(
                                         simpleJobFn,
                                         "LeafTestFollowOn"),
                                     expectedException=JobGraphDeadlockException)
Example #5
 def __init__(self, tree, event, sleepTime, startTime, cores):
     Job.__init__(self, cores=cores)
     self.tree = tree
     self.event = event
     self.sleepTime = sleepTime
     self.startTime = startTime
     self.cores = cores
Example #6
 def testEncapsulation(self):
     """
     Tests the Job.encapsulation method, which uses the EncapsulationJob
     class.
     """
     # Temporary file
     outFile = getTempFile(rootDir=self._createTempDir())
     try:
         # Encapsulate a job graph
         a = T.wrapJobFn(encapsulatedJobFn, "A", outFile)
         a = a.encapsulate()
         # Now add children/follow to the encapsulated graph
         d = T.wrapFn(f, a.rv(), outFile)
         e = T.wrapFn(f, d.rv(), outFile)
         a.addChild(d)
         a.addFollowOn(e)
         # Create the runner for the workflow.
         options = T.Runner.getDefaultOptions(self._getTestJobStorePath())
         options.logLevel = "INFO"
         # Run the workflow, the return value being the number of failed jobs
         T.Runner.startToil(a, options)
         # Check output
         self.assertEquals(open(outFile, 'r').readline(), "ABCDE")
     finally:
         os.remove(outFile)
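The helpers f and encapsulatedJobFn are defined elsewhere in the test module. A sketch of f that is consistent with the asserted "ABCDE" output, assuming each job appends its input string to the file and returns the next letter for the downstream job:

def f(string, outFile):
    # Append this job's letter and hand the next letter to the next job in the chain.
    with open(outFile, 'a') as fh:
        fh.write(string)
    return chr(ord(string) + 1)  # "A" -> "B", "B" -> "C", ...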
Example #7
 def testCacheEjection(self):
     """
     Test that the cache always ejects the least recently created file
     """
     # Makes three jobs that each create an output file, which they write to the filestore.  The combined size of any
     # two files is always less than cacheSize, but the combined size of all 3 is always more, so one file always has
     # to be ejected. Test to ensure that A is always ejected regardless of size.
     #  Make a temp directory for the test
     test_dir = self._createTempDir()
     for test in xrange(10):
         options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
         options.logLevel = "DEBUG"
         options.cacheSize = 100000
         options.retryCount=100
         options.badWorker=0.5
         options.badWorkerFailInterval = 1.0
         # Create a temp file to write the test results
         handle, logfile = tempfile.mkstemp(dir=test_dir)
         os.close(handle)
         file_sizes = [50000, 40000, 30000]
         # Randomize to (potentially) test all combinations
         random.shuffle(file_sizes)
         # Run the workflow. A, B and C do the cache operations, and D prints the test status to the temp file
         A = Job.wrapJobFn(fileTestJob, file_sizes[0])
         B = Job.wrapJobFn(fileTestJob, file_sizes[1])
         C = Job.wrapJobFn(fileTestJob, file_sizes[2])
         D = Job.wrapJobFn(fileTestCache, A.rv(), B.rv(), C.rv(), logfile)
         A.addChild(B)
         B.addChild(C)
         C.addChild(D)
         Job.Runner.startToil(A, options)
         #  Assert jobs passed by reading test results from tempFile
         with open(logfile, 'r') as outfile:
             for test_status in outfile:
                 assert test_status.strip() == 'True'
Example #8
 def _deleteLocallyReadFilesFn(self, readAsMutable):
     self.options.retryCount = 0
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True, memory='10M')
     B = Job.wrapJobFn(self._removeReadFileFn, A.rv(), readAsMutable=readAsMutable,
                       memory='20M')
     A.addChild(B)
     Job.Runner.startToil(A, self.options)
Example #9
def main():
    """Restarts a toil workflow.
    """
    
    ##########################################
    #Construct the arguments.
    ##########################################  

    parser = getBasicOptionParser()

    parser.add_argument("--version", action='version', version=version)

    parser.add_argument("jobStore", type=str,
          help=("Store in which to place job management files "
                "and globally accessed temporary files "
                "(if this is a file path it needs to be globally accessible "
                "by all machines running jobs).\n"
                "If the store already exists and restart is false, an "
                "ExistingJobStoreException will be thrown."))

    options = parseBasicOptions(parser)
        
    ##########################################
    #Now run the toil construction/leader
    ##########################################  
        
    setLoggingFromOptions(options)
    options.restart = True
    with setupToil(options) as (config, batchSystem, jobStore):
        jobStore.clean(Job._loadRootJob(jobStore))
        mainLoop(config, batchSystem, jobStore, Job._loadRootJob(jobStore))
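setupToil and mainLoop are older internal entry points. With the current public API, roughly the same restart can be expressed with the Toil context manager (a sketch, not the original script):

from toil.common import Toil

def restart_workflow(options):
    options.restart = True
    with Toil(options) as toil:
        return toil.restart()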
Example #10
 def testEncapsulation(self):
     """
     Tests the Job.encapsulation method, which uses the EncapsulationJob
     class.
     """
     #Temporary file
     outFile = getTempFile(rootDir=os.getcwd())
     #Make a job graph
     a = T.wrapFn(f, "A", outFile)
     b = a.addChildFn(f, a.rv(), outFile)
     c = a.addFollowOnFn(f, b.rv(), outFile)
     #Encapsulate it
     a = a.encapsulate()
     #Now add children/follow to the encapsulated graph
     d = T.wrapFn(f, c.rv(), outFile)
     e = T.wrapFn(f, d.rv(), outFile)
     a.addChild(d)
     a.addFollowOn(e)
     #Create the runner for the workflow.
     options = T.Runner.getDefaultOptions()
     options.logLevel = "INFO"
     #Run the workflow, the return value being the number of failed jobs
     self.assertEquals(T.Runner.startToil(a, options), 0)
     T.Runner.cleanup(options) #This removes the jobStore
     #Check output
     self.assertEquals(open(outFile, 'r').readline(), "ABCDE")
     #Cleanup
     os.remove(outFile)
Example #11
 def testPromiseRequirementRaceStatic(self):
     """
     Checks for a race condition when using promised requirements and child job functions.
     """
     A = Job.wrapJobFn(logDiskUsage, 'A', sleep=5, disk=PromisedRequirement(1024))
     B = Job.wrapJobFn(logDiskUsage, 'B', disk=PromisedRequirement(lambda x: x + 1024, A.rv()))
     A.addChild(B)
     Job.Runner.startToil(A, self.getOptions(self._createTempDir('testFiles')))
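logDiskUsage is not shown above. A hypothetical version that illustrates what the promised requirement resolves to: by the time each job runs, its disk requirement is a concrete number of bytes available as job.disk, and A's return value feeds the lambda that computes B's requirement:

import time

def logDiskUsage(job, jobName, sleep=0):
    # job.disk is the resolved disk requirement in bytes.
    job.fileStore.logToMaster('%s was allocated %s bytes of disk' % (jobName, job.disk))
    time.sleep(sleep)
    return job.disk  # consumed by B's PromisedRequirement lambda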
Example #12
 def testAddChildEncapsulate(self):
     """
     Make sure that the encapsulated child does not have two parents
     with unique roots.
     """
     a = T.wrapFn(noOp)
     b = T.wrapFn(noOp)
     a.addChild(b).encapsulate()
     self.assertEquals(len(a.getRootJobs()), 1)
Example #13
 def testReadCachHitFileFromJobStore(self):
     """
     Read a file from the file store that has a corresponding cached copy.  Ensure the number
     of links on the file are appropriate.
     """
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
     B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=True, cacheReadFile=None,
                       fsID=A.rv())
     A.addChild(B)
     Job.Runner.startToil(A, self.options)
Example #14
 def testControlledFailedWorkerRetry(self):
     """
     Conduct a couple of job store operations.  Then die.  Ensure that the restarted job is
     tracking values in the cache state file appropriately.
     """
     workdir = self._createTempDir(purpose='nonLocalDir')
     self.options.retryCount = 1
     F = Job.wrapJobFn(self._controlledFailTestFn, jobDisk=2*1024*1024*1024, testDir=workdir,
                       disk='2G')
     G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M')
     F.addChild(G)
     Job.Runner.startToil(F, self.options)
Example #15
 def _deleteLocallyReadFilesFn(self, readAsMutable):
     self.options.retryCount = 0
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True, memory='10M')
     B = Job.wrapJobFn(self._removeReadFileFn, A.rv(), readAsMutable=readAsMutable, memory='20M')
     A.addChild(B)
     try:
         Job.Runner.startToil(A, self.options)
     except FailedJobsException as err:
         self.assertEqual(err.numberOfFailedJobs, 2)
         errMsg = self._parseAssertionError(self.options.logFile)
         if 'explicitly' not in errMsg:
             self.fail('Shouldn\'t see this')
Example #16
        def _testCacheMissFunction(self, cacheReadFile):
            """
            This function does the actual work for the two cache-miss test functions.

            :param cacheReadFile: whether the read file should be cached (True) or not (False)
            """
            workdir = self._createTempDir(purpose='nonLocalDir')
            A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=False, nonLocalDir=workdir)
            B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=False,
                              cacheReadFile=cacheReadFile, fsID=A.rv())
            A.addChild(B)
            Job.Runner.startToil(A, self.options)
Example #17
 def testToilIsNotBroken(self):
     """
     Runs a simple DAG to test whether any features other than caching were broken.
     """
     A = Job.wrapJobFn(self._uselessFunc)
     B = Job.wrapJobFn(self._uselessFunc)
     C = Job.wrapJobFn(self._uselessFunc)
     D = Job.wrapJobFn(self._uselessFunc)
     A.addChild(B)
     A.addChild(C)
     B.addChild(D)
     C.addChild(D)
     Job.Runner.startToil(A, self.options)
Example #18
 def testServiceSerialization(self):
     """
     Tests that a service can receive a promise without producing a serialization
     error.
     """
     job = Job()
     service = TestServiceSerialization("woot")
     startValue = job.addService(service) # Add a first service to job
     subService = TestServiceSerialization(startValue) # Now create a child of 
     # that service that takes the start value promise from the parent service
     job.addService(subService, parentService=service) # This should work if
     # serialization on services is working correctly.
     
     self.runToil(job)
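TestServiceSerialization is defined elsewhere in the test module. A minimal sketch of such a service, assuming it follows the usual Job.Service contract in which start() returns the value that the promise from addService() resolves to:

from toil.job import Job

class EchoService(Job.Service):
    """A service whose start() value becomes the promise returned by addService()."""

    def __init__(self, value):
        Job.Service.__init__(self)
        self.value = value

    def start(self, job):
        return self.value

    def stop(self, job):
        pass

    def check(self):
        return True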
Example #19
 def test_star(self):
     """
     Test the functionality of align_rna
     """
     univ_options = self._getTestUnivOptions()
     config_file = os.path.join(self._projectRootPath(), "src/protect/test/test_inputs/ci_parameters.yaml")
     test_src_folder = os.path.join(self._projectRootPath(), "src", "protect", "test")
     a = Job.wrapJobFn(self._get_test_star_files)
     b = Job.wrapJobFn(self._get_all_tools, config_file).encapsulate()
     c = Job.wrapJobFn(self._get_tool, b.rv(), "star")
     d = Job.wrapJobFn(align_rna, a.rv(), univ_options, c.rv()).encapsulate()
     a.addChild(b)
     b.addChild(c)
     c.addChild(d)
     Job.Runner.startToil(a, self.options)
Example #20
 def testJobFileStoreWithSmallCache(self, retryCount=0, badWorker=0.0, 
                      stringNo=1, stringLength=1000000, cacheSize=10000, testNo=2):
     """
     Creates a chain of jobs, each reading and writing files using the 
     Job.FileStore interface. Verifies the files written are always what we expect.
     The chain tests the caching behavior. 
     """
     for test in xrange(testNo):
         #Make a list of random strings, each of stringLength chars, keyed by the
         #first PREFIX_LENGTH characters of the string
         def randomString():
             chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             s = "".join(map(lambda i : random.choice(chars), xrange(stringLength)))
             return s[:PREFIX_LENGTH], s
         #Total data is stringNo strings of stringLength characters each
         testStrings = dict(map(lambda i : randomString(), xrange(stringNo)))
         options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
         options.logLevel = "INFO"
         options.cacheSize = cacheSize
         options.retryCount=retryCount
         options.badWorker=badWorker
         options.badWorkerFailInterval = 1.0
         chainLength = 10
         # Run the workflow, the return value being the number of failed jobs
         Job.Runner.startToil(Job.wrapJobFn(fileTestJob, [], 
                                            testStrings, chainLength), 
                              options)
Example #21
 def testWriteLocalFileToJobStore(self):
     """
     Write a file from the localTempDir to the job store.  Such a file will be cached by
     default.  Ensure the file is cached.
     """
     A = Job.wrapJobFn(self._writeFileToJobStore, isLocalFile=True)
     Job.Runner.startToil(A, self.options)
Example #22
    def test(self):
        """
        Tests that a toil workflow that fails once can be resumed without a NoSuchJobException.
        """
        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.logLevel = "INFO"
        root = Job.wrapJobFn(parent)
        with self.assertRaises(FailedJobsException):
            # This one is intended to fail.
            Job.Runner.startToil(root, options)

        # Resume the workflow. Unfortunately, we have to check for
        # this bug using the logging output, since although the
        # NoSuchJobException causes the worker to fail, the batch
        # system code notices that the job has been deleted despite
        # the failure and avoids the failure.
        options.restart = True
        tempDir = self._createTempDir()
        options.logFile = os.path.join(tempDir, "log.txt")
        Job.Runner.startToil(root, options)
        with open(options.logFile) as f:
            logString = f.read()
            # We are looking for e.g. "Batch system is reporting that
            # the jobGraph with batch system ID: 1 and jobGraph
            # store ID: n/t/jobwbijqL failed with exit value 1"
            self.assertTrue("failed with exit value" not in logString)
Example #23
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument('--num-lines', default=1000, help='Number of lines in file to sort.', type=int)
    parser.add_argument('--line-length', default=50, help='Length of lines in file to sort.', type=int)
    parser.add_argument("--N",
                        help="The threshold below which a serial sort function is used to sort the file. "
                        "All lines must be of length less than or equal to N or the program will fail",
                        default=10000)

    options = parser.parse_args()

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    file_name = 'file_to_sort.txt'
    make_file_to_sort(file_name=file_name, lines=options.num_lines, line_length=options.line_length)

    with Toil(options) as toil:
        sort_file_url = 'file://' + os.path.abspath('file_to_sort.txt')
        if not toil.options.restart:
            sort_file_id = toil.importFile(sort_file_url)
            sorted_file_id = toil.start(Job.wrapJobFn(setup, sort_file_id, int(options.N), False, memory='1000M'))
        else:
            sorted_file_id = toil.restart()
        toil.exportFile(sorted_file_id, sort_file_url)
Example #24
            def userScript():
                from toil.job import Job
                from toil.common import Toil

                # A user-defined type, i.e. a type defined in the user script
                class X(object):
                    pass

                # noinspection PyUnusedLocal
                def job(job, x, disk='10M', cores=1, memory='10M'):
                    return x

                if __name__ == '__main__':
                    options = Job.Runner.getDefaultArgumentParser().parse_args()
                    x = X()
                    with Toil(options) as toil:
                        r = toil.start(Job.wrapJobFn(job, x).encapsulate())
                    # Assert that the return value is of type X, but not X from the __main__
                    # module but X from foo.bar, the canonical name for the user module. The
                    # translation from __main__ to foo.bar is a side effect of hot-deployment.
                    assert r.__class__ is not X
                    import foo.bar
                    assert r.__class__ is foo.bar.X
                    # Assert that a copy was made. This is a side effect of pickling/unpickling.
                    assert x is not r
Example #25
def main(args):
    """
    Parses command line arguments and does the work of the program.
    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """
    
    if len(args) == 2 and args[1] == "--test":
        # Run the tests
        return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
    
    options = parse_args(args) # This holds the nicely-parsed options object
    
    RealTimeLogger.start_master()
    
    # Make a root job
    root_job = Job.wrapJobFn(collate_all, options,
        cores=1, memory="1G", disk="1G")
    
    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job,  options)
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
        
    print("All jobs completed successfully")
    
    RealTimeLogger.stop_master()
Example #26
def main():
    """
    This is a Toil pipeline for the UNC best practice RNA-Seq analysis.
    RNA-seq fastqs are combined, aligned, sorted, filtered, and quantified.

    Please read the README.md located in the same directory.
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'config': args.config,
              'config_fastq': args.config_fastq,
              'input': args.input,
              'unc.bed': args.unc,
              'hg19.transcripts.fa': args.fasta,
              'composite_exons.bed': args.composite_exons,
              'normalize.pl': args.normalize,
              'output_dir': args.output_dir,
              'rsem_ref.zip': args.rsem_ref,
              'chromosomes.zip': args.chromosomes,
              'ebwt.zip': args.ebwt,
              'ssec': args.ssec,
              's3_dir': args.s3_dir,
              'sudo': args.sudo,
              'single_end_reads': args.single_end_reads,
              'upload_bam_to_s3': args.upload_bam_to_s3,
              'uuid': None,
              'sample.tar': None,
              'cpu_count': None}

    # Launch jobs
    Job.Runner.startToil(Job.wrapJobFn(download_shared_files, inputs), args)
Example #27
def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'genetorrent': args.genetorrent,
              'genetorrent_key': args.genetorrent_key,
              'ssec': args.ssec,
              's3_dir': args.s3_dir}
    # Sanity checks
    if args.ssec:
        assert os.path.isfile(args.ssec)
    if args.genetorrent:
        assert os.path.isfile(args.genetorrent)
    if args.genetorrent_key:
        assert os.path.isfile(args.genetorrent_key)
    samples = parse_genetorrent(args.genetorrent)
    # Start pipeline
    # map_job accepts a function, an iterable, and *args. The function is launched as a child
    # job with one element from the iterable and *args, which in turn spawns a tree of child jobs
    # (a simplified sketch of this pattern follows the example).
    Job.Runner.startToil(Job.wrapJobFn(map_job, download_and_transfer_sample, samples, inputs), args)
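A much simplified sketch of that fan-out pattern (the real map_job used here batches children into a tree to keep the number of direct children per job bounded; this flat version only shows the idea):

def map_job(job, func, inputs, *args):
    # Launch func once per input element as a child job, passing the shared args.
    for item in inputs:
        job.addChildJobFn(func, item, *args)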
Example #28
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    filtered_gams = []
    skip_words = options.skip.split(",")
    for gam in options.in_gams:
        skip_gam = False
        for word in skip_words:
            if len(word) > 0 and word in gam:
                skip_gam = True
        if not skip_gam:
            filtered_gams.append(gam)
    options.in_gams = filtered_gams

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job,  options)
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()
Example #29
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)

    parser.add_argument("--fileToSort", dest="fileToSort",
                      help="The file you wish to sort")

    parser.add_argument("--N", dest="N",
                      help="The threshold below which a serial sort function is "
                      "used to sort the file. All lines must be of length less than or equal to N or the program will fail",
                      default=10000)

    options = parser.parse_args()

    if options.fileToSort is None:
        raise RuntimeError("No file to sort given")

    if not os.path.exists(options.fileToSort):
        raise RuntimeError("File to sort does not exist: %s" % options.fileToSort)

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    #Now we are ready to run
    Job.Runner.startToil(Job.wrapJobFn(setup, options.fileToSort, int(options.N),
                                       memory=sortMemory), options)
Example #30
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta, input_file_ids.annotation_gp,
                        input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
Example #31
def run_whole_alignment(job,
                        context,
                        fastq,
                        gam_input_reads,
                        bam_input_reads,
                        sample_name,
                        interleaved,
                        mapper,
                        indexes,
                        reads_chunk_ids,
                        bam_output=False,
                        surject=False,
                        gbwt_penalty=None,
                        validate=False):
    """
    align all fastq chunks in parallel
    
    Takes a dict from index type to index file ID. Some indexes are extra and
    specifying them will change mapping behavior.
    
    Returns a list of per-contig GAMs, the total alignment runtime, and a list
    of per-contig BAM file IDs (which is only nonempty when surject is true).
    
    """

    # this will be a list of lists.
    # gam_chunk_file_ids[i][j], will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    gam_chunk_file_ids = []
    gam_chunk_running_times = []
    # depending on bam_output and surject options, we can make bam_output too
    bam_chunk_file_ids = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph alignment on each fastq chunk
        chunk_alignment_job = child_job.addChildJobFn(
            run_chunk_alignment,
            context,
            gam_input_reads,
            bam_input_reads,
            sample_name,
            interleaved,
            mapper,
            chunk_filename_ids,
            chunk_id,
            indexes,
            bam_output=bam_output,
            gbwt_penalty=gbwt_penalty,
            validate=validate,
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        if not bam_output:
            gam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        else:
            bam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        gam_chunk_running_times.append(chunk_alignment_job.rv(1))

    if not bam_output:
        merge_gams_job = child_job.addFollowOnJobFn(
            run_merge_gams,
            context,
            sample_name,
            indexes.get('id_ranges'),
            gam_chunk_file_ids,
            gam_chunk_running_times,
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk)
        gam_chrom_ids = merge_gams_job.rv(0)
        gam_chunk_time = merge_gams_job.rv(1)
        bam_chrom_ids = []
    else:
        gam_chrom_ids = []
        gam_chunk_time = None
        merge_bams_job = child_job.addFollowOnJobFn(run_merge_bams, context,
                                                    sample_name,
                                                    bam_chunk_file_ids)
        bam_chrom_ids = [merge_bams_job.rv()]

    if surject:
        interleaved_surject = interleaved or (fastq and len(fastq) == 2)
        zip_job = child_job.addFollowOnJobFn(run_zip_surject_input, context,
                                             gam_chunk_file_ids)
        xg_id = indexes['xg-surject'] if 'xg-surject' in indexes else indexes[
            'xg']
        bam_chrom_ids = [
            zip_job.addFollowOnJobFn(run_whole_surject, context, zip_job.rv(),
                                     sample_name + '-surject',
                                     interleaved_surject, xg_id, []).rv()
        ]

    return gam_chrom_ids, gam_chunk_time, bam_chrom_ids
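The rv(0)/rv(1) calls above are indexed promises: they refer to individual elements of the tuple that the child job function will eventually return. A small standalone illustration (names here are made up for the example):

def two_outputs(job):
    # Returns a (gam file ID, runtime) pair; callers can promise either element.
    return 'chunk.gam', 12.5

def report(job, gam_id, runtime):
    job.fileStore.logToMaster('%s aligned in %s seconds' % (gam_id, runtime))

def root(job):
    child = job.addChildJobFn(two_outputs)
    job.addFollowOnJobFn(report, child.rv(0), child.rv(1))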
Example #32
 def __init__(self, magma_bin, batch_results):
     Job.__init__(self, memory="100M", cores=1, disk="100M")
     self.magma_bin = magma_bin
     # list of dicts containing file IDs from all gene tests
     self.batch_results = batch_results
Example #33
    def testNestedResourcesDoNotBlock(self):
        """
        Resources are requested in the order Memory > Cpu > Disk.
        Test that unavailability of CPUs for one scheduled job does not block another
        job that can run.
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        options.maxCores = 4
        from toil import physicalMemory
        availableMemory = physicalMemory()
        options.batchSystem = self.batchSystemName

        outFile = os.path.join(tempDir, 'counter')
        open(outFile, 'w').close()

        root = Job()

        blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                             cores=2, memory='1M', disk='1M')
        firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                              cores=1, memory='1M', disk='1M')
        secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10,
                               writeVal='sJ', cores=1, memory='1M', disk='1M')

        # Should block off 50% of memory while waiting for its 3 cores
        firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                                   writeVal='fJC', cores=3, memory=int(availableMemory/2), disk='1M')

        # These two shouldn't be able to run before B because there should be only
        # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
        secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                    writeVal='sJC', cores=2, memory=int(availableMemory/1.5),
                                    disk='1M')
        secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                         writeVal='sJGC', cores=2, memory=int(availableMemory/1.5),
                                         disk='1M')

        root.addChild(blocker)
        root.addChild(firstJob)
        root.addChild(secondJob)

        firstJob.addChild(firstJobChild)
        secondJob.addChild(secondJobChild)

        secondJobChild.addChild(secondJobGrandChild)
        """
        The tree is:
                    root
                  /   |   \
                 b    fJ   sJ
                      |    |
                      fJC  sJC
                           |
                           sJGC
        But the order of execution should be
        root > b, fJ, sJ > sJC > sJGC > fJC
        since fJC cannot run until b finishes (it is waiting on cores), while sJC and sJGC can. If the
        resource acquisition is written properly, then fJC, which is scheduled before sJC and sJGC,
        should not block them, and should only run after they finish.
        """
        Job.Runner.startToil(root, options)
        with open(outFile) as oFH:
            outString = oFH.read()
        # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
        # time. We look for all possible permutations.
        possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
        assert outString.startswith(possibleStarts)
        assert outString.endswith('sJCsJGCfJC')
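_resourceBlockTestAuxFn is not shown. A hypothetical version consistent with how the test reads the output file: each job sleeps for its allotted time and then appends its marker, so the file records the order in which jobs finished:

import time

def _resourceBlockTestAuxFn(outFile, sleepTime, writeVal):
    # Sleep while holding the requested resources, then record completion order.
    time.sleep(sleepTime)
    with open(outFile, 'a') as fh:
        fh.write(writeVal)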
Example #34
 def __init__(self, fileId):
     Job.__init__(self)
     self.fileId = fileId
Example #35
def run_cactus_align(job,
                     configWrapper,
                     cactusWorkflowArguments,
                     project,
                     checkpointInfo,
                     doRenaming,
                     pafInput,
                     pafSecondaries,
                     doVG,
                     doGFA,
                     delay=0,
                     eventNameAsID=False,
                     referenceEvent=None,
                     pafMaskFilter=None):
    # this option (--stagger) can be used in batch mode to avoid starting all the alignment jobs at the same time
    time.sleep(delay)

    head_job = Job()
    job.addChild(head_job)

    # unzip the input sequences if necessary, and also extract paf masking beds
    preprocess_job = head_job.addChildJobFn(preprocess_input_sequences,
                                            configWrapper, project,
                                            cactusWorkflowArguments,
                                            pafMaskFilter, referenceEvent)
    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = preprocess_job.rv(0)
    mask_beds = preprocess_job.rv(1)

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids,
                                        cactusWorkflowArguments, project,
                                        doRenaming, eventNameAsID, mask_beds
                                        #todo disk=
                                        )
    cactusWorkflowArguments = cur_job.rv(0)
    mask_bed_id = cur_job.rv(1)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        # optionally apply the masking
        cur_job = cur_job.addFollowOnJobFn(mask_and_convert_paf,
                                           cactusWorkflowArguments,
                                           pafSecondaries, mask_bed_id)
        cactusWorkflowArguments = cur_job.rv()

    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage,
                                           cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase,
                                         cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(
        run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(
        exportHal,
        prepare_hal_export_job.rv(0),
        event=prepare_hal_export_job.rv(1),
        checkpointInfo=checkpointInfo,
        acyclicEvent=referenceEvent,
        memory=configWrapper.getDefaultMemory(),
        disk=configWrapper.getExportHalDisk(),
        preemptable=False)

    # optionally create the VG
    if doVG or doGFA:
        vg_export_job = hal_export_job.addFollowOnJobFn(
            export_vg,
            hal_export_job.rv(),
            configWrapper,
            doVG,
            doGFA,
            checkpointInfo=checkpointInfo)
        vg_file_id, gfa_file_id = vg_export_job.rv(0), vg_export_job.rv(1)
    else:
        vg_file_id, gfa_file_id = None, None

    return hal_export_job.rv(), vg_file_id, gfa_file_id
Example #36
        refDict_og = "human_g1k_b37_20.dict"
        refFasta = toil.importFile(
            "file:///home/lifeisaboutfishtacos/Desktop/wdl-tutorials/data/ref/human_g1k_b37_20.fasta"
        )
        refFasta_og = "human_g1k_b37_20.fasta"
        gatk = toil.importFile(
            "file:///home/lifeisaboutfishtacos/Desktop/wdl-tutorials/GenomeAnalysisTK.jar"
        )
        gatk_og = "GenomeAnalysisTK.jar"

        # Output Variables
        GVCF2_og = "GVCF2_rawLikelihoods.g.vcf"
        GVCF3_og = "GVCF3_rawLikelihoods.g.vcf"
        GVCF1_og = "GVCF1_rawLikelihoods.g.vcf"

        job0 = Job.wrapJobFn(initialize_jobs)
        job1 = Job.wrapJobFn(HaplotypeCallerERC, gatk, gatk_og, refFasta,
                             refFasta_og, refIndex, refIndex_og, refDict,
                             refDict_og, sample0_1, sample1_1, sample1_1_og,
                             sample2_1, sample2_1_og)
        job2 = Job.wrapJobFn(HaplotypeCallerERC, gatk, gatk_og, refFasta,
                             refFasta_og, refIndex, refIndex_og, refDict,
                             refDict_og, sample0_2, sample1_2, sample1_2_og,
                             sample2_2, sample2_2_og)
        job3 = Job.wrapJobFn(HaplotypeCallerERC, gatk, gatk_og, refFasta,
                             refFasta_og, refIndex, refIndex_og, refDict,
                             refDict_og, sample0_3, sample1_3, sample1_3_og,
                             sample2_3, sample2_3_og)
        job4 = Job.wrapJobFn(GenotypeGVCFs, gatk, gatk_og, refFasta,
                             refFasta_og, refIndex, refIndex_og, refDict,
                             refDict_og, 'CEUtrio', job2.rv(), GVCF2_og,
Example #37
 def testNonCachingFileStore(self):
     options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     options.disableCaching = True
     with Toil(options) as workflow:
         workflow.start(Job.wrapJobFn(simpleFileStoreJob))
Example #38
 def __init__(self):
     Job.__init__(self)
Example #39
from toil.common import Toil
from toil.job import Job


def helloWorld(message, memory="1G", cores=1, disk="1G"):
    return "Hello, world!, here's a message: %s" % message


if __name__ == "__main__":
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    options.clean = "always"
    with Toil(options) as toil:
        output = toil.start(Job.wrapFn(helloWorld, "You did it!"))
    print(output)
Example #40
from toil.common import Toil
from toil.job import Job


def helloWorld(message, memory="2G", cores=2, disk="3G"):
    return f"Hello, world!, here's a message: {message}"


if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "OFF"
    options.clean = "always"

    hello_job = Job.wrapFn(helloWorld, "Woot")

    with Toil(options) as toil:
        print(toil.start(hello_job))  # prints "Hello, world!, ..."
Example #41
    # Write another file using a stream; fileID2 is the
    # key for this second file.
    with job.fileStore.writeGlobalFileStream(cleanup=True) as (fH, fileID2):
        fH.write(b"Out brief candle")

    # Now read the first file; scratchFile2 is a local copy of the file that is read-only by default.
    scratchFile2 = job.fileStore.readGlobalFile(fileID)

    # Read the second file to a desired location: scratchFile3.
    scratchFile3 = os.path.join(job.tempDir, "foo.txt")
    job.fileStore.readGlobalFile(fileID2, userPath=scratchFile3)

    # Read the second file again using a stream.
    with job.fileStore.readGlobalFileStream(fileID2) as fH:
        print(fH.read())  # This prints "Out brief candle"

    # Delete the first file from the global file-store.
    job.fileStore.deleteGlobalFile(fileID)

    # It is unnecessary to delete the file keyed by fileID2 because we used the cleanup flag,
    # which removes the file after this job and all its successors have run (if the file still exists)


if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"

    with Toil(options) as toil:
        toil.start(Job.wrapJobFn(globalFileStoreJobFn))
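The snippet above starts partway through globalFileStoreJobFn, so the code that created fileID is missing. A self-contained sketch of the write/read round trip it relies on, with assumed names and file contents:

import os
from toil.common import Toil
from toil.job import Job

def fileStoreRoundTripJobFn(job):
    # Write a local scratch file into the global file store and remember its ID.
    scratchFile = os.path.join(job.tempDir, 'scratch.txt')
    with open(scratchFile, 'w') as fH:
        fH.write('Tomorrow and tomorrow and tomorrow')
    fileID = job.fileStore.writeGlobalFile(scratchFile)
    # Read it back (a read-only local copy by default) and then clean up.
    localCopy = job.fileStore.readGlobalFile(fileID)
    with open(localCopy) as fH:
        assert fH.read() == 'Tomorrow and tomorrow and tomorrow'
    job.fileStore.deleteGlobalFile(fileID)

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./toilWorkflowRun')
    options.clean = 'always'
    with Toil(options) as toil:
        toil.start(Job.wrapJobFn(fileStoreRoundTripJobFn))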
Example #42
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            split_id_map = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "graphmap"),
                                            "assemblyName",
                                            default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([
                seqFile.tree.getName(node)
                for node in seqFile.tree.getLeaves()
            ])

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            split_id_map = toil.start(
                Job.wrapJobFn(graphmap_split_workflow, options, config,
                              seqIDMap, gfa_id, options.minigraphGFA, paf_id,
                              options.graphmapPAF, ref_contigs,
                              options.otherContig))

        #export the split data
        export_split_data(toil, seqIDMap, split_id_map, options.outDir, config)
Example #43
import argparse
import os

from toil.job import Job


def f0(job):
    if 'FAIL' in os.environ:
        raise RuntimeError('failed on purpose')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(parser)
    options = parser.parse_args()
    rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M')
    Job.Runner.startToil(rootJob, options)
Example #44
 def createWorkflow():
     rootJob = Job.wrapJobFn(simpleJobFn, "Parent")
     childCheckpointJob = rootJob.addChildJobFn(simpleJobFn,
                                                "Child",
                                                checkpoint=True)
     return rootJob, childCheckpointJob
Example #45
    def testDockerClean(self,
                        disableCaching=True,
                        detached=True,
                        rm=True,
                        deferParam=None):
        """
        Run the test container that creates a file in the work dir, and sleeps
        for 5 minutes.
        Ensure that the calling job gets SIGKILLed after a minute, leaving
        behind the spooky/ghost/zombie container. Ensure that the container is
        killed on batch system shutdown (through the deferParam mechanism).
        """

        # We need to test the behaviour of `deferParam` with `rm` and
        # `detached`. We do not look at the case where `rm` and `detached` are
        # both True.  This is the truth table for the different combinations at
        # the end of the test. R = Running, X = Does not exist, E = Exists but
        # not running.
        #              None     FORGO     STOP    RM
        #    rm        X         R         X      X
        # detached     R         R         E      X
        #  Neither     R         R         E      X

        data_dir = os.path.join(self.tempDir, 'data')
        working_dir = os.path.join(self.tempDir, 'working')
        test_file = os.path.join(working_dir, 'test.txt')

        mkdir_p(data_dir)
        mkdir_p(working_dir)

        options = Job.Runner.getDefaultOptions(
            os.path.join(self.tempDir, 'jobstore'))
        options.logLevel = self.dockerTestLogLevel
        options.workDir = working_dir
        options.clean = 'always'
        options.disableCaching = disableCaching

        # No base64 logic since it might create a name starting with a `-`.
        container_name = uuid.uuid4().hex
        A = Job.wrapJobFn(_testDockerCleanFn, working_dir, detached, rm,
                          deferParam, container_name)
        try:
            Job.Runner.startToil(A, options)
        except FailedJobsException:
            # The file created by spooky_container would remain in the directory
            # and since it was created inside the container, it would have had
            # uid and gid == 0 (root) which may cause problems when docker
            # attempts to clean up the jobstore.
            file_stats = os.stat(test_file)
            assert file_stats.st_gid != 0
            assert file_stats.st_uid != 0

            if (rm and (deferParam != FORGO)) or deferParam == RM:
                # These containers should not exist
                assert containerIsRunning(container_name) is None, \
                    'Container was not removed.'

            elif deferParam == STOP:
                # These containers should exist but be non-running
                assert containerIsRunning(container_name) == False, \
                    'Container was not stopped.'

            else:
                # These containers will be running
                assert containerIsRunning(container_name) == True, \
                    'Container was not running.'
        client = docker.from_env(version='auto')
        dockerKill(container_name, client)
        try:
            os.remove(test_file)
        except:
            pass
Example #46
 def test(self):
     options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     options.logLevel = 'INFO'
     root = Job.wrapJobFn(d)
     self.assertEqual(Job.Runner.startToil(root, options), ('b', 43, 3))
Example #47
            def userScript():
                import os
                import time
                from toil.job import Job
                from toil.common import Toil
                from toil.leader import FailedJobsException

                TIMEOUT = 10

                def root(rootJob):
                    def nullFile():
                        return rootJob.fileStore.jobStore.importFile(
                            'file:///dev/null')

                    startFile = nullFile()
                    endFile = nullFile()

                    rootJob.addChildJobFn(deferring, startFile, endFile)
                    encapsulatedJob = Job.wrapJobFn(encapsulated, startFile)
                    encapsulatedJob.addChildFn(dummy)
                    encapsulatedJob.addChildFn(dummy)
                    encapsulatedJob.addFollowOnJobFn(trigger, endFile)
                    encapsulatingJob = encapsulatedJob.encapsulate()
                    rootJob.addChild(encapsulatingJob)

                def dummy():
                    pass

                def deferredFile(config):
                    """
                    Return path to a file at the root of the job store, exploiting the fact that
                    the job store is shared between leader and worker container.
                    """
                    prefix = 'file:'
                    locator = config.jobStore
                    assert locator.startswith(prefix)
                    return os.path.join(locator[len(prefix):],
                                        'testDeferredFile')

                def deferred(deferredFilePath):
                    """
                    The deferred function that is supposed to run.
                    """
                    os.unlink(deferredFilePath)

                # noinspection PyUnusedLocal
                def deferring(job, startFile, endFile):
                    """
                    A job that adds the deferred function and then crashes once the `trigger` job
                    tells it to.
                    """
                    job.defer(deferred, deferredFile(job._config))
                    jobStore = job.fileStore.jobStore
                    jobStore.deleteFile(startFile)
                    with jobStore.updateFileStream(endFile) as fH:
                        fH.write(str(os.getpid()))
                    timeout = time.time() + TIMEOUT
                    while jobStore.fileExists(endFile):
                        assert time.time() < timeout
                        time.sleep(1)
                    os.kill(os.getpid(), 9)

                def encapsulated(job, startFile):
                    """
                    A job that waits until the `deferring` job is running and waiting to be crashed.
                    """
                    timeout = time.time() + TIMEOUT
                    while job.fileStore.jobStore.fileExists(startFile):
                        assert time.time() < timeout
                        time.sleep(1)

                def trigger(job, endFile):
                    """
                    A job that determines the PID of the worker running the `deferring` job,
                    tells the `deferring` job to crash and then waits for the corresponding
                    worker process to end. By waiting we can be sure that the `follow-on` job
                    finds the left-overs of the `deferring` job.
                    """
                    import errno
                    jobStore = job.fileStore.jobStore
                    with jobStore.readFileStream(endFile) as fH:
                        pid = int(fH.read())
                    os.kill(pid, 0)
                    jobStore.deleteFile(endFile)
                    timeout = time.time() + TIMEOUT
                    while True:
                        try:
                            os.kill(pid, 0)
                        except OSError as e:
                            if e.errno == errno.ESRCH:
                                break
                            else:
                                raise
                        else:
                            assert time.time() < timeout
                            time.sleep(1)

                def tryUnlink(deferredFilePath):
                    try:
                        os.unlink(deferredFilePath)
                    except OSError as e:
                        if e.errno == errno.ENOENT:
                            pass
                        else:
                            raise

                if __name__ == '__main__':
                    import errno
                    options = Job.Runner.getDefaultArgumentParser().parse_args(
                    )
                    with Toil(options) as toil:
                        deferredFilePath = deferredFile(toil.config)
                        open(deferredFilePath, 'w').close()
                        try:
                            assert os.path.exists(deferredFilePath)
                            try:
                                toil.start(Job.wrapJobFn(root))
                            except FailedJobsException as e:
                                assert e.numberOfFailedJobs == 2  # `root` and `deferring`
                                assert not os.path.exists(deferredFilePath), \
                                    'Apparently, the deferred function did not run.'
                            else:
                                assert False, 'Workflow should not have succeeded.'
                        finally:
                            tryUnlink(deferredFilePath)
Example #48
 def createWorkflow():
     rootJob = Job.wrapJobFn(simpleJobFn, "Root", checkpoint=True)
     return rootJob, rootJob
Example #49
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="*",
        help=
        "Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outHal",
                        type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Pangenome Options
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' argument is in PAF format, rather than lastz cigars."
    )
    parser.add_argument(
        "--usePafSecondaries",
        action="store_true",
        help=
        "use the secondary alignments from the PAF input.  They are ignored by default."
    )
    parser.add_argument("--singleCopySpecies",
                        type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument(
        "--barMaskFilter",
        type=int,
        default=None,
        help=
        "BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)"
    )
    parser.add_argument(
        "--pafMaskFilter",
        type=int,
        default=None,
        help=
        "softmasked (query) regions greather than this length will be removed from the input PAF before it is processed"
    )
    parser.add_argument(
        "--outVG",
        action="store_true",
        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument(
        "--outGFA",
        action="store_true",
        help="export pangenome grpah in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument(
        "--batch",
        action="store_true",
        help=
        "Launch batch of alignments.  Input seqfile is expected to be chromfile as generated by cactus-graphmap-slit"
    )
    parser.add_argument(
        "--stagger",
        type=int,
        help=
        "Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)",
        default=0)
    parser.add_argument(
        "--reference",
        type=str,
        help=
        "Ensure that given genome is acyclic by deleting all paralogy edges in postprocessing, also do not mask its PAF mappings"
    )

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError(
                "S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(
            options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else
                 os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore),
                                  options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
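        # in --batch mode the per-chromosome alignment files come from the chromfile,
        # so no cigarsFile arguments are expected on the command line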
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect.  Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-graphmap)
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    options.eventNameAsID = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if options.eventNameAsID is not None:
        options.eventNameAsID = bool(
            options.eventNameAsID) and options.eventNameAsID != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(
        int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(
                Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][
                    1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(
                        vgID,
                        makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(
                        gfaID,
                        makeURL(
                            os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Example #50
0
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.reference:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.reference not in leaves:
            raise RuntimeError(
                "Genome specified with --reference, {}, not found in tree leaves"
                .format(options.reference))

    if options.pafMaskFilter and not options.pafInput:
        raise RuntimeError("--pafMaskFilter can only be run with --pafInput")

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix
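    # For example (hypothetical paths): with cigarsFile = ['aln.cig', 'aln.cig.secondary'],
    # get_input_path('.secondary') returns 'aln.cig.secondary' and get_input_path() returns
    # 'aln.cig'; if no input carries the requested suffix, the suffix is appended to the
    # entry whose basename is a prefix of the others' basenames.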

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              referenceEvent=options.reference,
                              pafMaskFilter=options.pafMaskFilter)
    return align_job
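
One plausible way (a sketch, not from the original source) to launch the job returned above; 'options' is assumed to be the parsed cactus-align options from the main() shown earlier, with restart handling omitted:

with Toil(options) as toil:
    align_job = make_align_job(options, toil)
    results = toil.start(align_job)
    # the returned file IDs can then be exported with toil.exportFile(), as main() does above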
Example #51
0
def nextChainableJobGraph(jobGraph, jobStore):
    """Returns the next chainable jobGraph after this jobGraph if one
    exists, or None if the chain must terminate.
    """
    #If no more jobs to run or services not finished, quit
    if len(jobGraph.stack) == 0 or len(
            jobGraph.services) > 0 or jobGraph.checkpoint != None:
        logger.debug(
            "Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
            len(jobGraph.stack), len(jobGraph.services),
            jobGraph.checkpoint != None)
        return None

    #Get the next set of jobs to run
    jobs = jobGraph.stack[-1]
    assert len(jobs) > 0

    #If there are 2 or more jobs to run in parallel we quit
    if len(jobs) >= 2:
        logger.debug(
            "No more jobs can run in series by this worker,"
            " it's got %i children",
            len(jobs) - 1)
        return None

    #We check the requirements of the jobGraph to see if we can run it
    #within the current worker
    successorJobNode = jobs[0]
    if successorJobNode.memory > jobGraph.memory:
        logger.debug("We need more memory for the next job, so finishing")
        return None
    if successorJobNode.cores > jobGraph.cores:
        logger.debug("We need more cores for the next job, so finishing")
        return None
    if successorJobNode.disk > jobGraph.disk:
        logger.debug("We need more disk for the next job, so finishing")
        return None
    if successorJobNode.preemptable != jobGraph.preemptable:
        logger.debug(
            "Preemptability is different for the next job, returning to the leader"
        )
        return None
    if successorJobNode.predecessorNumber > 1:
        logger.debug(
            "The jobGraph has multiple predecessors, we must return to the leader."
        )
        return None

    # Load the successor jobGraph
    successorJobGraph = jobStore.load(successorJobNode.jobStoreID)

    # Somewhat ugly, but check if job is a checkpoint job and quit if
    # so
    if successorJobGraph.command.startswith("_toil "):
        #Load the job
        successorJob = Job._loadJob(successorJobGraph.command, jobStore)

        # Check it is not a checkpoint
        if successorJob.checkpoint:
            logger.debug("Next job is checkpoint, so finishing")
            return None

    # Made it through! This job is chainable.
    return successorJobGraph
Example #52
0
 def __init__(self, daner_file):
     Job.__init__(self, memory="100M", cores=1, disk="100M")
     self.daner_file = daner_file
Example #53
0
def map_main(context, options):
    """
    Wrapper for vg map. 
    """

    validate_map_options(context, options)

    # How long did it take to run the entire pipeline, in seconds?
    run_time_pipeline = None

    # Mark when we start the pipeline
    start_time_pipeline = timeit.default_timer()

    with context.get_toil(options.jobStore) as toil:
        if not toil.options.restart:

            importer = AsyncImporter(toil)

            # Make an index collection
            indexes = {}

            # Upload each index we have
            if options.xg_index is not None:
                indexes['xg'] = importer.load(options.xg_index)
            if options.gcsa_index is not None:
                indexes['gcsa'] = importer.load(options.gcsa_index)
                indexes['lcp'] = importer.load(options.gcsa_index + ".lcp")
            if options.gbwt_index is not None:
                indexes['gbwt'] = importer.load(options.gbwt_index)
            if options.distance_index is not None:
                indexes['distance'] = importer.load(options.distance_index)
            if options.minimizer_index is not None:
                indexes['minimizer'] = importer.load(options.minimizer_index)
            if options.snarls_index is not None:
                indexes['snarls'] = importer.load(options.snarls_index)
            if options.id_ranges is not None:
                indexes['id_ranges'] = importer.load(options.id_ranges)

            # Upload other local files to the remote IO Store
            inputReadsFileIDs = []
            if options.fastq:
                for sample_reads in options.fastq:
                    inputReadsFileIDs.append(importer.load(sample_reads))
            elif options.gam_input_reads:
                inputReadsFileIDs.append(importer.load(
                    options.gam_input_reads))
            else:
                assert options.bam_input_reads
                inputReadsFileIDs.append(importer.load(
                    options.bam_input_reads))

            importer.wait()

            # Make a root job
            root_job = Job.wrapJobFn(
                run_mapping,
                context,
                options.fastq,
                options.gam_input_reads,
                options.bam_input_reads,
                options.sample_name,
                options.interleaved,
                options.mapper,
                importer.resolve(indexes),
                reads_file_ids=importer.resolve(inputReadsFileIDs),
                bam_output=options.bam_output,
                surject=options.surject,
                validate=options.validate,
                cores=context.config.misc_cores,
                memory=context.config.misc_mem,
                disk=context.config.misc_disk)

            # Init the outstore
            init_job = Job.wrapJobFn(run_write_info_to_outstore,
                                     context,
                                     sys.argv,
                                     memory=context.config.misc_mem,
                                     disk=context.config.misc_disk)
            init_job.addFollowOn(root_job)

            # Run the job and store the returned list of output files to download
            toil.start(init_job)
        else:
            toil.restart()

    end_time_pipeline = timeit.default_timer()
    run_time_pipeline = end_time_pipeline - start_time_pipeline

    logger.info(
        "All jobs completed successfully. Pipeline took {} seconds.".format(
            run_time_pipeline))
Example #54
0
def run_mapping(job,
                context,
                fastq,
                gam_input_reads,
                bam_input_reads,
                sample_name,
                interleaved,
                mapper,
                indexes,
                reads_file_ids=None,
                reads_chunk_ids=None,
                bam_output=False,
                surject=False,
                gbwt_penalty=None,
                validate=False):
    """
    Split the fastq, then align each chunk.
    
    Exactly one of fastq, gam_input_reads, or bam_input_reads should be
    non-falsey, to indicate what kind of data the file IDs in reads_file_ids or
    reads_chunk_ids correspond to.
    
    Exactly one of reads_file_ids or reads_chunk_ids should be specified.
    reads_file_ids holds a list of file IDs of non-chunked input read files,
    which will be chunked if necessary. reads_chunk_ids holds lists of chunk
    IDs for each read file, as produced by run_split_reads_if_needed.
    
    indexes is a dict from index type ('xg', 'gcsa', 'lcp', 'id_ranges',
    'gbwt', 'minimizer', 'distance', 'snarls') to index file ID. Some indexes
    are extra and specifying them will change mapping behavior. Some indexes
    are required for certain values of mapper.
    
    mapper can be 'map', 'mpmap', or 'gaffe'. For 'map' and 'mpmap', the 'gcsa'
    and 'lcp' indexes are required. For 'gaffe', the 'gbwt', 'minimizer' and
    'distance' indexes are required. All the mappers require the 'xg' index.
    
    If bam_output is set, produce BAMs. If surject is set, surject reads down
    to paths. 
    
    If the 'gbwt' index is present and gbwt_penalty is specified, the default
    recombination penalty will be overridden.
    
    returns output gams, one per chromosome, the total mapping time (excluding
    toil-vg overhead such as transferring and splitting files), and output
    BAMs, one per chromosome, if computed.
    """

    # Make sure we have exactly one type of input
    assert (bool(fastq) + bool(gam_input_reads) + bool(bam_input_reads) == 1)

    # Make sure we have exactly one kind of file IDs
    assert (bool(reads_file_ids) + bool(reads_chunk_ids) == 1)

    # We may have to have a job to chunk the reads
    chunk_job = None

    if reads_chunk_ids is None:
        # If the reads are not pre-chunked for us, we have to chunk them.
        chunk_job = job.addChildJobFn(run_split_reads_if_needed,
                                      context,
                                      fastq,
                                      gam_input_reads,
                                      bam_input_reads,
                                      reads_file_ids,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk)
        reads_chunk_ids = chunk_job.rv()

    # We need a job to do the alignment
    align_job = Job.wrapJobFn(run_whole_alignment,
                              context,
                              fastq,
                              gam_input_reads,
                              bam_input_reads,
                              sample_name,
                              interleaved,
                              mapper,
                              indexes,
                              reads_chunk_ids,
                              bam_output=bam_output,
                              surject=surject,
                              gbwt_penalty=gbwt_penalty,
                              validate=validate,
                              cores=context.config.misc_cores,
                              memory=context.config.misc_mem,
                              disk=context.config.misc_disk)

    if chunk_job is not None:
        # Alignment must happen after chunking
        chunk_job.addFollowOn(align_job)
    else:
        # Alignment can happen now
        job.addChild(align_job)

    return align_job.rv()
Example #55
0
 def __init__(self, inputFileID, failFileID):
     Job.__init__(self,  memory=100000, cores=1, disk="1M")
     self.inputFileID = inputFileID
     self.failFileID = failFileID
Example #56
0
def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal",
                        type=str,
                        help="Output directory (can be s3://)")
    parser.add_argument(
        "--alignOptions",
        type=str,
        help=
        "Options to pass through to cactus-align (don't forget to wrap in quotes)"
    )
    parser.add_argument("--alignCores",
                        type=int,
                        help="Number of cores per align job")
    parser.add_argument(
        "--alignCoresOverrides",
        nargs="*",
        help=
        "Override align job cores for a chromosome. Space-separated list of chrom,cores pairse epxected"
    )

    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))

    options = parser.parse_args()

    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except:
                raise RuntimeError(
                    "Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides
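    # For example (hypothetical values): --alignCoresOverrides chr1,32 chrX,16
    # produces {'chr1': 32, 'chrX': 16}.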

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = toil.importFile(
                            makeURL(seqfile)), toil.importFile(
                                makeURL(alnFile))
            results_dict = toil.start(
                Job.wrapJobFn(align_toil_batch, chrom_dict, config_id,
                              options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(
                        results[3],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info(
        "cactus-align-batch has finished after {} seconds".format(run_time))
Example #57
0
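    # (Fragment) Tail of a runQC job function; its full signature can be inferred from
    # the Job.wrapJobFn(runQC, ...) call in the __main__ block below.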
    job.fileStore.readGlobalFile(output_file, userPath=os.path.join(outputs_dir, "sample_" + output_num + "_" + output_filename))
    return output_file


if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"
    with Toil(options) as toil:

        # specify the folder where the cwl and yml files live
        inputs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cwlExampleFiles")
        # specify where you wish the outputs to be written
        outputs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cwlExampleFiles")

        job0 = Job.wrapJobFn(initialize_jobs)

        cwl_filename = "hello.cwl"
        cwl_file = toil.importFile("file://" + os.path.abspath(os.path.join(inputs_dir, cwl_filename)))

        # add list of yml config inputs here or import and construct from file
        yml_files = ["hello1.yml", "hello2.yml", "hello3.yml"]
        i = 0
        for yml in yml_files:
            i = i + 1
            yml_file = toil.importFile("file://" + os.path.abspath(os.path.join(inputs_dir, yml)))
            yml_filename = yml
            job = Job.wrapJobFn(runQC, cwl_file, cwl_filename, yml_file, yml_filename, outputs_dir, output_num=str(i))
            job0.addChild(job)

        toil.start(job0)
Example #58
0
def workerScript(jobStore,
                 config,
                 jobName,
                 jobStoreID,
                 redirectOutputToLogFile=True):
    """
    Worker process script; runs a job.

    :param jobStore: The job store containing the job to be run
    :param config: The Toil configuration for this workflow
    :param str jobName: The "job name" (a user-friendly name) of the job to be run
    :param str jobStoreID: The job store ID of the job to be run
    :param bool redirectOutputToLogFile: If True, redirect the worker's stdout and stderr to a log file

    :return int: 1 if a job failed, or 0 if all jobs succeeded
    """
    logging.basicConfig()
    setLogLevel(config.logLevel)

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    logFileByteReportLimit = config.maxLogFileSize

    if config.badWorker > 0 and random.random() < config.badWorker:
        # We need to kill the process we are currently in, to simulate worker
        # failure. We don't want to just send SIGKILL, because we can't tell
        # that from a legitimate OOM on our CI runner. We're going to send
        # SIGUSR1 so our terminations are distinctive, and then SIGKILL if that
        # didn't stick. We definitely don't want to do this from *within* the
        # process we are trying to kill, so we fork off. TODO: We can still
        # leave the killing code running after the main Toil flow is done, but
        # since it's now in a process instead of a thread, the main Python
        # process won't wait around for its timeout to expire. I think this is
        # better than the old thread-based way where all of Toil would wait
        # around to be killed.

        killTarget = os.getpid()
        sleepTime = config.badWorkerFailInterval * random.random()
        if os.fork() == 0:
            # We are the child
            # Let the parent run some amount of time
            time.sleep(sleepTime)
            # Kill it gently
            os.kill(killTarget, signal.SIGUSR1)
            # Wait for that to stick
            time.sleep(0.01)
            try:
                # Kill it harder. Hope the PID hasn't already been reused.
                # If we succeeded the first time, this will OSError
                os.kill(killTarget, signal.SIGKILL)
            except OSError:
                pass
            # Exit without doing any of Toil's cleanup
            os._exit(0)

        # We don't need to reap the child. Either it kills us, or we finish
        # before it does. Either way, init will have to clean it up for us.

    ##########################################
    #Load the environment for the jobGraph
    ##########################################

    #First load the environment for the jobGraph.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = safeUnpickleFromStream(fileHandle)
    env_blacklist = {
        "TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE", "HOME", "LOGNAME", "USER",
        "DISPLAY", "JAVA_HOME"
    }
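    # These variables describe the leader's machine or login session rather than the
    # workflow itself, so they are not copied into the worker's environment.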
    for i in environment:
        if i == "PATH":
            # Handle path specially. Sometimes e.g. leader may not include
            # /bin, but the Toil appliance needs it.
            if i in os.environ and os.environ[i] != '':
                # Use the provided PATH and then the local system's PATH
                os.environ[i] = environment[i] + ':' + os.environ[i]
            else:
                # Use the provided PATH only
                os.environ[i] = environment[i]
        elif i not in env_blacklist:
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID,
                                               config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################

    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    # Do we even want to redirect output? Let the config make us not do it.
    redirectOutputToLogFile = redirectOutputToLogFile and not config.disableWorkerOutputCapture

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    if redirectOutputToLogFile:
        # Announce that we are redirecting logging, and where it will now go.
        # This is important if we are trying to manually trace a faulty worker invocation.
        logger.info("Redirecting logging to %s", tempWorkerLogPath)
        sys.stdout.flush()
        sys.stderr.flush()

        # Save the original stdout and stderr (by opening new file descriptors
        # to the same files)
        origStdOut = os.dup(1)
        origStdErr = os.dup(2)

        # Open the file to send stdout/stderr to.
        logFh = os.open(tempWorkerLogPath,
                        os.O_WRONLY | os.O_CREAT | os.O_APPEND)

        # Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        # Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)

        # Since we only opened the file once, all the descriptors duped from
        # the original will share offset information, and won't clobber each
        # others' writes. See <http://stackoverflow.com/a/5284108/402891>. This
        # shouldn't matter, since O_APPEND seeks to the end of the file before
        # every write, but maybe there's something odd going on...

        # Close the descriptor we used to open the file
        os.close(logFh)

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    jobAttemptFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    blockFn = lambda: True
    listOfJobs = [jobName]
    job = None
    try:

        #Put a message at the top of the log, just to make sure it's working.
        logger.info("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()

        logProcessContext(config)

        ##########################################
        #Connect to the deferred function system
        ##########################################
        deferredFunctionManager = DeferredFunctionManager(toilWorkflowDir)

        ##########################################
        #Load the jobGraph
        ##########################################

        jobGraph = jobStore.load(jobStoreID)
        listOfJobs[0] = str(jobGraph)
        logger.debug("Parsed job wrapper")

        ##########################################
        #Cleanup from any earlier invocation of the jobGraph
        ##########################################

        if jobGraph.command == None:
            logger.debug("Wrapper has no user job to run.")
            # Cleanup jobs already finished
            f = lambda jobs: [
                z for z in [[y for y in x if jobStore.exists(y.jobStoreID)]
                            for x in jobs] if len(z) > 0
            ]
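            # f() keeps only the successor references that still exist in the job store
            # and drops any lists that become empty as a result.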
            jobGraph.stack = f(jobGraph.stack)
            jobGraph.services = f(jobGraph.services)
            logger.debug(
                "Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may
        #have been left if the job is being retried after a job failure.
        oldLogFile = jobGraph.logJobStoreFileID
        if oldLogFile != None:
            jobGraph.logJobStoreFileID = None
            jobStore.update(jobGraph)  #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobGraph.checkpoint != None:
            logger.debug("Job is a checkpoint")
            # If the checkpoint still has extant jobs in its
            # (flattened) stack and services, its subtree didn't
            # complete properly. We handle the restart of the
            # checkpoint here, removing its previous subtree.
            if len([i for l in jobGraph.stack
                    for i in l]) > 0 or len(jobGraph.services) > 0:
                logger.debug("Checkpoint has failed.")
                # Reduce the retry count
                assert jobGraph.remainingRetryCount >= 0
                jobGraph.remainingRetryCount = max(
                    0, jobGraph.remainingRetryCount - 1)
                jobGraph.restartCheckpoint(jobStore)
            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug(
                    "The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete."
                )
                #Delete any remnant files
                list(
                    map(
                        jobStore.deleteFile,
                        list(
                            filter(jobStore.fileExists,
                                   jobGraph.checkpointFilesToDelete))))

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.stats:
            startClock = getTotalCpuTime()

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobGraph, if there is one
            ##########################################

            if jobGraph.command is not None:
                assert jobGraph.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobGraph.command)
                #Load the job
                job = Job._loadJob(jobGraph.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobGraph.checkpoint = jobGraph.command

                # Create a fileStore object for the job
                fileStore = AbstractFileStore.createFileStore(
                    jobStore,
                    jobGraph,
                    localWorkerTempDir,
                    blockFn,
                    caching=not config.disableCaching)
                with job._executor(jobGraph=jobGraph,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with deferredFunctionManager.open() as defer:
                        with fileStore.open(job):
                            # Get the next block function to wait on committing this job
                            blockFn = fileStore.waitForCommit

                            job._runner(jobGraph=jobGraph,
                                        jobStore=jobStore,
                                        fileStore=fileStore,
                                        defer=defer)

                            # When the job succeeds, start committing files immediately.
                            fileStore.startCommit(jobState=False)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

            else:
                #The command may be none, in which case
                #the jobGraph is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                logger.debug("No user job to run, so finishing")
                break

            if AbstractFileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobGraph within the worker
            ##########################################
            successorJobGraph = nextChainableJobGraph(jobGraph, jobStore)
            if successorJobGraph is None or config.disableChaining:
                # Can't chain any more jobs.
                # TODO: why don't we commit the last job's file store? Couldn't
                # its async uploads end up never finishing?
                # If we do call startCommit here it messes with the job
                # itself and Toil thinks the job needs to run again.
                break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobGraph command and stack
            #into the current jobGraph object so that it can be run
            #as if it were a command that were part of the current jobGraph.
            #We can then delete the successor jobGraph in the jobStore, as it is
            #wholly incorporated into the current jobGraph.
            ##########################################

            # add the successor to the list of jobs run
            listOfJobs.append(str(successorJobGraph))

            #Clone the jobGraph and its stack
            jobGraph = copy.deepcopy(jobGraph)

            #Remove the successor jobGraph
            jobGraph.stack.pop()

            #Transplant the command and stack to the current jobGraph
            jobGraph.command = successorJobGraph.command
            jobGraph.stack += successorJobGraph.stack
            # include some attributes for better identification of chained jobs in
            # logging output
            jobGraph.unitName = successorJobGraph.unitName
            jobGraph.jobName = successorJobGraph.jobName
            assert jobGraph.memory >= successorJobGraph.memory
            assert jobGraph.cores >= successorJobGraph.cores

            #Build a fileStore to update the job
            fileStore = AbstractFileStore.createFileStore(
                jobStore,
                jobGraph,
                localWorkerTempDir,
                blockFn,
                caching=not config.disableCaching)

            #Update blockFn
            blockFn = fileStore.waitForCommit

            #Add successorJobGraph to those to be deleted
            fileStore.jobsToDelete.add(successorJobGraph.jobStoreID)

            #This will update the job once the previous job is done
            fileStore.startCommit(jobState=True)

            #Clone the jobGraph and its stack again, so that updates to it do
            #not interfere with this update
            jobGraph = copy.deepcopy(jobGraph)

            logger.debug("Starting the next job")

        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        if redirectOutputToLogFile:
            logger.info(
                "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
                localWorkerTempDir)

        logger.info(
            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
            time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except:  #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s",
                     socket.gethostname())
        AbstractFileStore._terminateEvent.set()

    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if AbstractFileStore._terminateEvent.isSet():
        jobGraph = jobStore.load(jobStoreID)
        jobGraph.setupJobAfterFailure(config)
        jobAttemptFailed = True
        if job and jobGraph.remainingRetryCount == 0:
            job._succeeded = False

    ##########################################
    #Cleanup
    ##########################################

    # Close the worker logging
    # Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    if redirectOutputToLogFile:
        # Flush at the OS level
        os.fsync(1)
        os.fsync(2)

        # Close redirected stdout and replace with the original standard output.
        os.dup2(origStdOut, 1)

        # Close redirected stderr and replace with the original standard error.
        os.dup2(origStdErr, 2)

        # sys.stdout and sys.stderr don't need to be modified at all. We don't
        # need to call redirectLoggerStreamHandlers since they still log to
        # sys.stderr

        # Close our extra handles to the original standard output and standard
        # error streams, so we don't leak file handles.
        os.close(origStdOut)
        os.close(origStdErr)

    # Now our file handles are in exactly the state they were in before.

    # Copy back the log file to the global dir, if needed.
    # Note that we work with bytes instead of characters so we can seek
    # relative to the end (since Python won't decode Unicode backward, or even
    # interpret seek offsets in characters for us). TODO: We may get invalid or
    # just different Unicode by breaking up a character at the boundary!
    if jobAttemptFailed and redirectOutputToLogFile:
        jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID(
            jobGraph.jobStoreID, cleanup=True)
        jobGraph.chainedJobs = listOfJobs
        with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, 'rb') as f:
                if os.path.getsize(
                        tempWorkerLogPath) > logFileByteReportLimit != 0:
                    if logFileByteReportLimit > 0:
                        f.seek(-logFileByteReportLimit,
                               2)  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
                        f.seek(logFileByteReportLimit,
                               0)  # seek to first tooBig bytes of file
                # Dump the possibly-invalid-Unicode bytes into the log file
                w.write(f.read())  # TODO load file using a buffer
        jobStore.update(jobGraph)

    elif ((debugging or (config.writeLogsFromAllJobs
                         and not jobName.startswith(CWL_INTERNAL_JOBS)))
          and redirectOutputToLogFile):  # write log messages
        with open(tempWorkerLogPath, 'rb') as logFile:
            if os.path.getsize(
                    tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
                    logFile.seek(-logFileByteReportLimit,
                                 2)  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
                    logFile.seek(logFileByteReportLimit,
                                 0)  # seek to first tooBig bytes of file
            # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
            # We may have damaged the Unicode text by cutting it at an arbitrary byte so we drop bad characters.
            logMessages = [
                line.decode('utf-8', 'ignore')
                for line in logFile.read().splitlines()
            ]
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster
        ) and not jobAttemptFailed:  # We have stats/logging to report back
        if USING_PYTHON2:
            jobStore.writeStatsAndLogging(
                json.dumps(statsDict, ensure_ascii=True))
        else:
            jobStore.writeStatsAndLogging(
                json.dumps(statsDict, ensure_ascii=True).encode())

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and
                               not jobAttemptFailed) or (cleanUp == 'onError'
                                                         and jobAttemptFailed):
        shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not jobAttemptFailed) and jobGraph.command == None and len(
            jobGraph.stack) == 0 and len(jobGraph.services) == 0:
        # We can now safely get rid of the jobGraph
        jobStore.delete(jobGraph.jobStoreID)

    if jobAttemptFailed:
        return 1
    else:
        return 0
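
The truncation logic above is pure byte arithmetic: a positive logFileByteReportLimit keeps the tail of the log by seeking backwards from the end of the file (whence=2), while a negative limit keeps the head by bounding the read. Below is a minimal standalone sketch of the same pattern, independent of Toil; the function name and sample values are illustrative only:

import os

def read_limited(path, limit):
    # limit > 0: return the last `limit` bytes of the file;
    # limit < 0: return the first `-limit` bytes;
    # limit == 0: return the whole file.
    with open(path, 'rb') as f:
        if limit != 0 and os.path.getsize(path) > abs(limit):
            if limit > 0:
                f.seek(-limit, 2)   # jump `limit` bytes back from the end
                return f.read()
            return f.read(-limit)   # read only the first -limit bytes
        return f.read()

# e.g. read_limited('/tmp/worker_log.txt', 50000) would keep the last 50 kB.
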
Example #59
0
def main(options=None):
    if not options:
        # deal with command line arguments
        parser = ArgumentParser()
        Job.Runner.addToilOptions(parser)
        parser.add_argument('--numLines',
                            default=defaultLines,
                            help='Number of lines in file to sort.',
                            type=int)
        parser.add_argument('--lineLength',
                            default=defaultLineLen,
                            help='Length of lines in file to sort.',
                            type=int)
        parser.add_argument("--fileToSort", help="The file you wish to sort")
        parser.add_argument("--outputFile",
                            help="Where the sorted output will go")
        parser.add_argument(
            "--overwriteOutput",
            help="Write over the output file if it already exists.",
            default=True)
        parser.add_argument(
            "--N",
            dest="N",
            help="The threshold below which a serial sort function is used to sort the file. "
                 "All lines must be of length less than or equal to N or the program will fail.",
            default=10000)
        parser.add_argument(
            '--downCheckpoints',
            action='store_true',
            help='If this option is set, the workflow will make checkpoints on its way through '
                 'the recursive "down" part of the sort.')
        parser.add_argument(
            "--sortMemory",
            dest="sortMemory",
            help="Memory for jobs that sort chunks of the file.",
            default=None)

        parser.add_argument("--mergeMemory",
                            dest="mergeMemory",
                            help="Memory for jobs that collate results.",
                            default=None)

        options = parser.parse_args()
    if not hasattr(options, "sortMemory") or not options.sortMemory:
        options.sortMemory = sortMemory
    if not hasattr(options, "mergeMemory") or not options.mergeMemory:
        options.mergeMemory = sortMemory

    # do some input verification
    sortedFileName = options.outputFile or "sortedFile.txt"
    if not options.overwriteOutput and os.path.exists(sortedFileName):
        print(
            "The output file {} already exists. Delete it to run the sort example again, "
            "or use --overwriteOutput=True.".format(sortedFileName))
        exit()

    fileName = options.fileToSort
    if options.fileToSort is None:
        # make the file ourselves
        fileName = 'fileToSort.txt'
        if os.path.exists(fileName):
            print("Sorting existing file: {}".format(fileName))
        else:
            print(
                'No sort file specified. Generating one automatically called: {}.'
                .format(fileName))
            makeFileToSort(fileName=fileName,
                           lines=options.numLines,
                           lineLen=options.lineLength)
    else:
        if not os.path.exists(options.fileToSort):
            raise RuntimeError("File to sort does not exist: %s" %
                               options.fileToSort)

    if int(options.N) <= 0:
        raise RuntimeError("Invalid value of N: %s" % options.N)

    # Now we are ready to run
    with Toil(options) as workflow:
        sortedFileURL = 'file://' + os.path.abspath(sortedFileName)
        if not workflow.options.restart:
            sortFileURL = 'file://' + os.path.abspath(fileName)
            sortFileID = workflow.importFile(sortFileURL)
            sortedFileID = workflow.start(
                Job.wrapJobFn(setup,
                              sortFileID,
                              int(options.N),
                              options.downCheckpoints,
                              options=options,
                              memory=sortMemory))
        else:
            sortedFileID = workflow.restart()
        workflow.exportFile(sortedFileID, sortedFileURL)
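
For context, a script built around this main() is launched like any other Toil workflow: the options added by Job.Runner.addToilOptions include a positional job store locator and a --restart flag, alongside the example's own arguments. The script name and job store locator below are placeholders, not part of the example:

# First run: generate a file (if none is given) and sort it, checkpointing on the way down.
#   python sort.py file:sort-jobstore --numLines 5000 --lineLength 25 --downCheckpoints
# Resume a previously failed run against the same job store:
#   python sort.py file:sort-jobstore --restart
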
Example #60
0
 def testUnicodeSupport(self):
     options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
     options.clean = 'always'
     options.logLevel = 'debug'
     Job.Runner.startToil(Job.wrapFn(printUnicodeCharacter), options)
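
The printUnicodeCharacter function being wrapped is not shown in this excerpt. A plausible stand-in, purely illustrative and not the project's actual helper, only needs to emit a non-ASCII character so that the worker log handling shown earlier has multi-byte UTF-8 text to round-trip:

import logging

logger = logging.getLogger(__name__)

def printUnicodeCharacter():
    # Emit a non-ASCII character via both print and the logger so the
    # captured worker log contains multi-byte UTF-8 text.
    print(u'\u03c0')                     # Greek small letter pi
    logger.debug(u'\u03c0 was printed')
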