def run(self):
    """Build a random file tree, point a tree-pointer file at it, then spawn
    the child that consumes it and a follow-on that destroys it.

    Deliberately raises RuntimeError roughly half the time so jobTree's
    job-retry machinery gets exercised.
    """
    ##########################################
    #Setup a file tree.
    ##########################################
    scratchTree = TempFileTree(os.path.join(self.getGlobalTempDir(),
                                            getRandomAlphaNumericString()))
    rootFile = scratchTree.getTempFile()
    makeFileTree(rootFile, self.depth, scratchTree)
    pointerFile = scratchTree.getTempFile()
    makeTreePointer(rootFile, pointerFile)
    logger.info("We've set up the file tree")
    # Simulated transient failure (~50% of runs) to test retry logic.
    if random.random() > 0.5:
        raise RuntimeError()
    ##########################################
    #Issue the child and follow on jobs
    ##########################################
    self.addChildTarget(ChildTarget(pointerFile))
    self.setFollowOnTarget(DestructFileTree(scratchTree))
    logger.info("We've added the child target and finished SetupFileTree.run()")
def run(self):
    """Schedule one RunBlast child per unordered pair of chunks, then a
    follow-on that collates every result file into the final output."""
    resultTree = TempFileTree(os.path.join(self.getGlobalTempDir(),
                                           "allAgainstAllResults"))
    chunkCount = len(self.chunks)
    #Make the list of blast jobs: every unordered pair (i, j) with i < j.
    for i in xrange(chunkCount):
        for j in xrange(i + 1, chunkCount):
            resultsFile = resultTree.getTempFile()
            self.resultsFiles.append(resultsFile)
            self.addChildTarget(RunBlast(self.blastOptions,
                                         self.chunks[i], self.chunks[j],
                                         resultsFile))
    logger.info("Made the list of all-against-all blasts")
    #Collation runs only after all blast children have completed.
    self.setFollowOnTarget(CollateBlasts(self.finalResultsFile, self.resultsFiles))
def single_copy_wrapper(target, args):
    """
    Main pipeline wrapper. Runs halSingleCopyRegionsExtract once for each
    region in the conserved_bed file.
    """
    # Keep only chrom/start/end; any extra BED columns are dropped.
    regions = [line.split()[:3] for line in open(args.conserved_bed)]
    result_tree = TempFileTree(target.getGlobalTempDir())
    # One child job per batch of 10 regions.
    for batch in grouper(regions, 10):
        out_path = result_tree.getTempFile()
        target.addChildTargetFn(find_single_copy, args=(args, batch, out_path))
    target.setFollowOnTargetFn(cat_results, args=(args, result_tree.listFiles()))
def extract_maf_wrapper(target, args):
    """
    Main pipeline wrapper. Calls out to hal2maf once for each region in
    args.conserved_bed.
    """
    # The reference genome always counts as accelerated.
    accelerated_genomes = set(args.accelerated_genomes + [args.ref_genome])
    outgroup_genomes = set(args.target_genomes) - accelerated_genomes
    regions = [line.split() for line in open(args.conserved_bed)]
    result_tree = TempFileTree(target.getGlobalTempDir())
    # One child job per batch of 50 BED records.
    for batch in grouper(regions, 50):
        out_path = result_tree.getTempFile()
        target.addChildTargetFn(extract_and_calculate,
                                args=(args, batch, accelerated_genomes,
                                      outgroup_genomes, out_path))
    target.setFollowOnTargetFn(cat_results, args=(args, result_tree.listFiles()))
def main_hints_fn(target, bam_paths, db_path, genome, genome_fasta, hints_dir):
    """
    Main driver function. Loops over each BAM, inferring paired-ness, then
    passing each BAM with one chromosome name for filtering. Each BAM will
    remain separated until the final concatenation and sorting of the hint
    gffs.
    """
    filtered_tree = TempFileTree(get_tmp(target, global_dir=True,
                                         name="filter_file_tree"))
    for bam in bam_paths:
        # Paired-end BAMs need extra flags for the downstream sort/filter.
        if bam_is_paired(bam) is True:
            pair_flags = "--paired --pairwiseAlignments"
        else:
            pair_flags = ""
        handle = pysam.Samfile(bam)
        # One filtering child per reference-name group in this BAM.
        for refs in group_references(handle):
            filtered_bam = filtered_tree.getTempFile(suffix=".bam")
            target.addChildTargetFn(sort_by_name, memory=8 * 1024 ** 3, cpu=2,
                                    args=[bam, refs, filtered_bam, pair_flags])
    target.setFollowOnTargetFn(build_hints,
                               args=[filtered_tree, genome, db_path,
                                     genome_fasta, hints_dir])
def dless_wrapper(target, args, split_ss_dict):
    """
    Wrapper for dless function.

    Spawns one dless child per split sub-alignment file found under each
    chromosome's directory in split_ss_dict, then a follow-on that
    concatenates all resulting GFFs.

    split_ss_dict: mapping of chromosome -> directory of split sub-alignment
    files (presumably built by read_subalignment_dir upstream -- TODO confirm
    against the caller).
    """
    # BUG FIX: the original immediately clobbered the split_ss_dict parameter
    # with read_subalignment_dir(split_ss_path), but split_ss_path is not
    # defined until inside the loop below, so the call raised NameError.
    # Use the dict the caller passed in instead.
    output_gff_tree = TempFileTree(
        os.path.join(target.getGlobalTempDir(), 'output_gff'))
    for chromosome, split_ss_dir in split_ss_dict.iteritems():
        for split_ss in os.listdir(split_ss_dir):
            # Suffix keeps the originating sub-alignment name in the gff path.
            gff_path = output_gff_tree.getTempFile(suffix=split_ss + '.gff')
            split_ss_path = os.path.join(split_ss_dir, split_ss)
            target.addChildTargetFn(dless, args=(split_ss_path, gff_path,
                                                 args.model))
    target.setFollowOnTargetFn(cat_dless, args=(args, output_gff_tree))
def run(self):
    """Chunk both sequence sets, schedule a RunBlast child for every
    chunk1 x chunk2 pair, and a follow-on that collates all result files."""
    chunks1 = self.getChunks(self.sequenceFiles1,
                             makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks1")))
    chunks2 = self.getChunks(self.sequenceFiles2,
                             makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks2")))
    tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(),
                                             "allAgainstAllResults"))
    #TODO: Make the compression work
    # Hoisted out of the loop: this flag is loop-invariant; the original
    # re-assigned it once per chunk pair.
    self.blastOptions.compressFiles = False
    resultsFiles = []
    #Make the list of blast jobs.
    for chunk1 in chunks1:
        for chunk2 in chunks2:
            resultsFile = tempFileTree.getTempFile()
            resultsFiles.append(resultsFile)
            self.addChildTarget(RunBlast(self.blastOptions, chunk1, chunk2,
                                         resultsFile))
    logger.info("Made the list of blasts")
    #Set up the job to collate all the results
    self.setFollowOnTarget(CollateBlasts(self.finalResultsFile, resultsFiles))
def setUp(self):
    """Create the test parameters, job-tree path, and temp-file tree."""
    unittest.TestCase.setUp(self)
    self.testNo = TestStatus.getTestSetup(1, 1, 2, 2)
    self.depth = TestStatus.getTestSetup(1, 2, 3, 5)
    cwd = os.getcwd()
    # Directory the job tree will be created in.
    self.jobTreeDir = os.path.join(cwd, "jobTree")
    # Kept under cwd so the file tree stays visible for inspection.
    self.tempFileTreeDir = os.path.join(cwd, "tempFileTree")
    # Source of temporary files for the tests.
    self.tempFileTree = TempFileTree(self.tempFileTreeDir)
def setUp(self):
    """Create the test parameters, paths, and temp-file tree, then restart
    parasol so each test begins with a fresh batch system."""
    unittest.TestCase.setUp(self)
    self.testNo = TestStatus.getTestSetup(1, 1, 5, 5)
    self.depth = TestStatus.getTestSetup(1, 2, 2, 3)
    # os.path.join instead of "+"-concatenation -- consistent with the other
    # setUp implementations in this file and portable across separators.
    self.jobTreeDir = os.path.join(os.getcwd(), "jobTree") #A directory for the job tree to be created in
    self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree")
    self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
    parasolRestart()
def align_augustus(target, genome, ref_fasta, target_fasta, target_fasta_index, out_db):
    """Fan out alignment children in batches of 250 target sequence ids, then
    a follow-on that merges the per-batch results into out_db."""
    result_tree = TempFileTree(target.getGlobalTempDir())
    # First whitespace-separated column of the fasta index is the sequence id.
    ids = [record.split()[0] for record in open(target_fasta_index)]
    for id_batch in grouper(ids, 250):
        target.addChildTargetFn(align,
                                args=[target_fasta, id_batch, ref_fasta,
                                      result_tree])
    target.setFollowOnTargetFn(cat, args=(genome, result_tree, out_db))
class TestCase(unittest.TestCase):
    """Stress test: runs a jobTree workload while parasol nodes and the
    master are being repeatedly killed, verifying the rescue machinery."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.testNo = TestStatus.getTestSetup(1, 1, 5, 5)
        self.depth = TestStatus.getTestSetup(1, 2, 2, 3)
        self.jobTreeDir = os.getcwd() + "/jobTree" #A directory for the job tree to be created in
        self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree")
        self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from
        # Start each test with a freshly restarted parasol.
        parasolRestart()

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        # Bounce parasol so a later test isn't affected by this one's kills.
        parasolStop()
        parasolRestart()
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir)) #Cleanup the job tree in case it hasn't already been cleaned up.

    def testJobTree_Parasol(self):
        """Runs a test program using the job tree, whilst constantly restarting parasol
        by killing the nodes.
        """
        for test in xrange(self.testNo): #Does not run this test when doing short testing
            jobTreeCommand, fileTreeRootFile = setupJobTree(self.tempFileTree,
                                                            self.jobTreeDir,
                                                            "parasol",
                                                            depth=self.depth)
            jobTreeCommand += " --rescueJobsFrequency 20"
            #Run the job
            parasolAndMasterKiller = ParasolAndMasterKiller()
            parasolAndMasterKiller.start()
            # Outer loop: keep re-running jobTreeRun until the job tree
            # reaches a clean end state.  Inner loop: retry the master
            # whenever it exits with a non-zero status (killed).
            while True:
                while True:
                    process = subprocess.Popen(jobTreeCommand, shell=True)
                    sts = os.waitpid(process.pid, 0)
                    if sts[1] == 0:
                        logger.info("The job tree master ended, with an okay exit value (using parasol)")
                        break
                    else:
                        logger.info("The job tree master ended with an error exit value, restarting: %i" % sts[1])
                if checkEndStateOfJobTree(self.jobTreeDir): #Check the state of the job files
                    break
                # Not finished: resume the existing job tree on the next pass.
                jobTreeCommand = "jobTreeRun --jobTree %s --logDebug" % self.jobTreeDir
            # Verify the file tree the workload built, then clean up.
            checkFileTreeCounts(fileTreeRootFile)
            os.system("rm -rf %s" % self.jobTreeDir)
            parasolAndMasterKiller.stopKilling()
            logger.info("Test done okay")
class TestCase(unittest.TestCase):
    """Exercises jobTree's dependency handling by driving the external
    jobTreeTest_Dependencies.py script over several tree shapes."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.jobTreeDir = os.path.join(os.getcwd(), "testJobTree") #A directory for the job tree to be created in
        self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree") #Ensures that file tree is visible
        self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir)) #Cleanup the job tree in case it hasn't already been cleaned up.

    # only done in singleMachine for now. Experts can run manually on other systems if they choose
    def dependenciesTest(self, batchSystem="singleMachine", furtherOptionsString=""):
        """Run the dependency-test script with several shape/size/cpu combos."""
        def fn(tree, maxCpus, maxThreads, size, cpusPerJob, sleepTime):
            # Fresh job tree for each combination; log file goes in the
            # temp-file tree so it survives for inspection.
            system("rm -rf %s" % self.jobTreeDir)
            logName = self.tempFileTree.getTempFile(suffix="_comblog.txt", makeDir=False)
            commandLine = "jobTreeTest_Dependencies.py --jobTree %s --logFile %s --batchSystem '%s' --tree %s --maxCpus %s --maxThreads %s --size %s --cpusPerJob=%s --sleepTime %s %s" % \
                (self.jobTreeDir, logName, batchSystem, tree, maxCpus, maxThreads, size, cpusPerJob, sleepTime, furtherOptionsString)
            system(commandLine)
        fn("comb", 10, 100, 100, 1, 10)
        fn("comb", 200, 100, 100, 20, 10)
        fn("fly", 10, 8, 100, 1, 10)
        fn("fly", 10, 8, 100, 2, 10)
        fn("balanced", 5, 10, 100, 1, 10)
        fn("balanced", 5, 10, 100, 3, 10)

    def testJobTree_dependencies_singleMachine(self):
        self.dependenciesTest(batchSystem="singleMachine")

    def testJobTree_dependencies_combined(self):
        self.dependenciesTest(batchSystem="singleMachine", furtherOptionsString="--bigBatchSystem singleMachine --bigMemoryThreshold 1000000")

    def testJobTree_dependencies_parasol(self):
        # Deliberately disabled: the bare return makes the body unreachable.
        return
        if parasolIsInstalled():
            self.dependenciesTest(batchSystem="parasol")

    def testJobTree_dependencies_gridengine(self):
        # Deliberately disabled: the bare return makes the body unreachable.
        return
        if gridEngineIsInstalled():
            self.dependenciesTest(batchSystem="gridengine")
class TestCase(unittest.TestCase):
    """Runs the standard jobTree test program on each available batch system,
    plus the external dependency test over three tree shapes."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.testNo = TestStatus.getTestSetup(1, 1, 2, 2)
        self.depth = TestStatus.getTestSetup(1, 2, 3, 5)
        cwd = os.getcwd()
        # Directory the job tree will be created in.
        self.jobTreeDir = os.path.join(cwd, "jobTree")
        # Kept under cwd so the file tree stays visible.
        self.tempFileTreeDir = os.path.join(cwd, "tempFileTree")
        # Source of temporary files for the tests.
        self.tempFileTree = TempFileTree(self.tempFileTreeDir)

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        # Remove the job tree in case a test failed to clean up after itself.
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir))

    def testJobTree_SingleMachine(self):
        testJobTree(self.testNo, self.depth, self.tempFileTree,
                    self.jobTreeDir, "singleMachine")

    def testJobTree_Parasol(self):
        if parasolIsInstalled():
            testJobTree(self.testNo, self.depth, self.tempFileTree,
                        self.jobTreeDir, "parasol")

    def testJobTree_gridengine(self):
        if gridEngineIsInstalled():
            testJobTree(self.testNo, self.depth, self.tempFileTree,
                        self.jobTreeDir, "gridengine")

    def testJobTree_dependencies(self):
        # Same dependency test over three tree shapes, wiping the job tree
        # before each run.
        for shape in ("comb", "fly", "balanced"):
            os.system("rm -rf %s" % self.jobTreeDir)
            system("jobTreeTest_Dependencies.py --jobTree %s --tree %s --maxThreads 100"
                   % (self.jobTreeDir, shape))
def align_gp(target, genome, ref_genome, ref_tx_fasta, target_genome_fasta, gp, mode, out_db, comp_ann_path, chunk_size):
    """
    Initial wrapper job. Constructs a file tree and starts alignment job
    batches in groups of chunk_size.
    Follow on: concatenates file tree.
    """
    result_tree = TempFileTree(target.getGlobalTempDir())
    # One child per batch of chunk_size genePred records.
    for batch in grouper(open(gp), chunk_size):
        target.addChildTargetFn(align_wrapper,
                                args=[batch, result_tree, ref_tx_fasta,
                                      target_genome_fasta, comp_ann_path,
                                      ref_genome, mode])
    target.setFollowOnTargetFn(cat, args=[genome, result_tree, out_db, mode])
def build_analyses(target, ref_genome, genome, annotation_gp, psl, gp, aug_gp, fasta, ref_fasta, sizes, gencode_attributes, out_dir):
    """Launch one child per user-defined augustus classifier, then a follow-on
    that merges results into sqlite databases and builds BED tracks."""
    # find all user-defined classes in the categories of analyses
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    for classifier_cls in classes_in_module(src.augustus_classifiers):
        child = classifier_cls(genome, psl, fasta, ref_fasta, annotation_gp,
                               gencode_attributes, gp, ref_genome,
                               out_file_tree, aug_gp)
        target.addChildTarget(child)
    # merge the resulting pickled files into sqlite databases and construct BED tracks
    target.setFollowOnTargetFn(database, memory=8 * (1024 ** 3),
                               args=(out_dir, genome, psl, sizes, gp,
                                     annotation_gp, out_file_tree))
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path):
    """
    Produces one jobTree target per genePred entry.

    Per-entry chunking was kept because initial testing showed ~15 seconds to
    extract the RNAseq hints and ~1 minute per Augustus instance, which is a
    reasonable amount of work per job.
    """
    # File tree in the global temp dir holding each Augustus instance's gtf.
    gtf_tree = TempFileTree(target.getGlobalTempDir())
    # Scratch file everything is reduced into before sorting.
    tmp_unsorted = os.path.join(target.getGlobalTempDir(),
                                getRandomAlphaNumericString(10))
    for gp_line in open(input_gp):
        target.addChildTargetFn(transmap_2_aug,
                                args=[gp_line, genome, sizes_path, fasta_path,
                                      gtf_tree])
    target.setFollowOnTargetFn(cat, args=[genome, output_gtf, tmp_unsorted,
                                          gtf_tree])
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path, hints_db):
    """
    Produces one jobTree target per genePred entry.
    """
    # File tree in the global temp dir holding each Augustus instance's gtf.
    gtf_tree = TempFileTree(target.getGlobalTempDir())
    # Scratch file the final results are reduced into before sorting.
    tmp_unsorted = os.path.join(target.getGlobalTempDir(),
                                getRandomAlphaNumericString(10))
    for gp_line in open(input_gp):
        target.addChildTargetFn(transmap_2_aug, memory=8 * (1024 ** 3),
                                args=[gp_line, genome, sizes_path, fasta_path,
                                      gtf_tree, hints_db])
    target.setFollowOnTargetFn(cat, args=[output_gtf, tmp_unsorted, gtf_tree])
def build_hints(target, filtered_bam_tree, genome, db_path, genome_fasta, hints_dir):
    """
    Driver function for hint building. Builds intron and exon hints for every
    filtered BAM, then calls cat_hints to do final concatenation and sorting.
    """
    intron_tree = TempFileTree(get_tmp(target, global_dir=True, name="intron_hints_tree"))
    exon_tree = TempFileTree(get_tmp(target, global_dir=True, name="exon_hints_tree"))
    # Two children per filtered BAM: one for intron hints, one for exon hints.
    for bam in (f for f in filtered_bam_tree.listFiles() if f.endswith("bam")):
        target.addChildTargetFn(build_intron_hints, memory=8 * 1024 ** 3, cpu=2,
                                args=[bam, intron_tree.getTempFile(suffix=".intron.gff")])
        target.addChildTargetFn(build_exon_hints, memory=8 * 1024 ** 3, cpu=2,
                                args=[bam, exon_tree.getTempFile(suffix=".exon.gff")])
    target.setFollowOnTargetFn(cat_hints, args=[intron_tree, exon_tree, genome,
                                                db_path, genome_fasta, hints_dir])
def testTempFileTree(self):
    """Randomized exercise of TempFileTree: fill the tree to capacity with a
    mix of files and directories, verify the over-capacity error, remove a
    random subset, check the listing, then tear the tree down one way or the
    other and confirm the root is gone."""
    for test in range(100): #self.testNo):
        levels = random.choice(range(1, 4))
        fileNo = random.choice(range(1, 6))
        # Capacity of the tree is fileNo ** levels entries.
        maxTempFiles = int(math.pow(fileNo, levels))
        print("Got %s levels, %s fileNo and %s maxTempFiles" % (levels, fileNo, maxTempFiles))
        tempFileTreeRootDir = os.path.join(self.tempDir, getRandomAlphaNumericString())
        tempFileTree = TempFileTree(tempFileTreeRootDir, fileNo, levels)
        tempFiles = []
        tempDirs = []
        #Check we can make the max number of temp files.
        for i in range(maxTempFiles):
            if random.random() > 0.5:
                tempFile = tempFileTree.getTempFile()
                assert os.path.isfile(tempFile)
                tempFiles.append(tempFile)
            else:
                tempFile = tempFileTree.getTempDirectory()
                assert os.path.isdir(tempFile)
                tempDirs.append(tempFile)
        #Check the expected error is raised once the tree is full.
        try:
            tempFileTree.getTempFile()
            assert False
        except RuntimeError:
            logger.debug("Got expected error message")
        #Now remove a few temp files
        while random.random() > 0.1 and len(tempFiles) > 0:
            tempFile = tempFiles.pop()
            assert os.path.isfile(tempFile)
            tempFileTree.destroyTempFile(tempFile)
            assert not os.path.isfile(tempFile)
        #Now remove a few temp dirs
        while random.random() > 0.1 and len(tempDirs) > 0:
            tempDir = tempDirs.pop()
            assert os.path.isdir(tempDir)
            tempFileTree.destroyTempDir(tempDir)
            assert not os.path.isdir(tempDir)
        #Check the tree's listing agrees with what we believe is live.
        #BUG FIX: the original computed this comparison and silently
        #discarded the result; it must be asserted to have any effect.
        assert set(tempFileTree.listFiles()) == set(tempFiles + tempDirs)
        #Either remove all the temp files or just destroy the whole thing
        if random.random() > 0.5:
            #Remove all temp files and check thing is empty.
            for tempFile in tempFiles:
                tempFileTree.destroyTempFile(tempFile)
            for tempDir in tempDirs:
                tempFileTree.destroyTempDir(tempDir)
            os.remove(os.path.join(tempFileTreeRootDir, "lock"))
            os.rmdir(tempFileTreeRootDir)
        else:
            tempFileTree.destroyTempFiles()
        assert not os.path.isdir(tempFileTreeRootDir)
def setUp(self):
    """Prepare the job-tree path and temp-file tree under cwd."""
    unittest.TestCase.setUp(self)
    base = os.getcwd()
    # Directory the job tree will be created in.
    self.jobTreeDir = os.path.join(base, "testJobTree")
    # Placed under cwd so the file tree stays visible for inspection.
    self.tempFileTreeDir = os.path.join(base, "tempFileTree")
    # A place to get temp files from.
    self.tempFileTree = TempFileTree(self.tempFileTreeDir)