def run(self):
    ##########################################
    #Setup a file tree.
    ##########################################
    tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(),
                                             getRandomAlphaNumericString()))
    fileTreeRootFile = tempFileTree.getTempFile()
    makeFileTree(fileTreeRootFile, self.depth, tempFileTree)
    treePointer = tempFileTree.getTempFile()
    makeTreePointer(fileTreeRootFile, treePointer)
    logger.info("We've set up the file tree")
    #Deliberately fail about half the time; test code like this exercises
    #jobTree's job-retry handling.
    if random.random() > 0.5:
        raise RuntimeError()
    ##########################################
    #Issue the child and follow on jobs
    ##########################################
    self.addChildTarget(ChildTarget(treePointer))
    self.setFollowOnTarget(DestructFileTree(tempFileTree))
    logger.info("We've added the child target and finished SetupFileTree.run()")
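#makeFileTree and makeTreePointer are helpers from the surrounding test module
#and are not shown here. A minimal sketch of what makeFileTree plausibly does,
#assuming each node file simply lists the paths of its children; the branching
#factor and the file format are guesses, not the real implementation.
import os

def makeFileTree(rootFile, depth, tempFileTree):
    #Hypothetical sketch: record the paths of the child node files in
    #rootFile, recursing until depth is exhausted, so the tree can later be
    #walked and torn down by DestructFileTree.
    children = []
    if depth > 0:
        for _ in range(2):  #assumed branching factor
            childFile = tempFileTree.getTempFile()
            makeFileTree(childFile, depth - 1, tempFileTree)
            children.append(childFile)
    with open(rootFile, "w") as f:
        f.write("\n".join(children))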
def build_hints(target, filtered_bam_tree, genome, db_path, genome_fasta, hints_dir):
    """
    Driver function for hint building. Builds intron and exon hints, then
    calls cat_hints to do the final concatenation and sorting.
    """
    bam_files = [x for x in filtered_bam_tree.listFiles() if x.endswith("bam")]
    intron_hints_tree = TempFileTree(get_tmp(target, global_dir=True, name="intron_hints_tree"))
    exon_hints_tree = TempFileTree(get_tmp(target, global_dir=True, name="exon_hints_tree"))
    for bam_file in bam_files:
        intron_hints_path = intron_hints_tree.getTempFile(suffix=".intron.gff")
        target.addChildTargetFn(build_intron_hints, memory=8 * 1024 ** 3, cpu=2,
                                args=[bam_file, intron_hints_path])
        exon_hints_path = exon_hints_tree.getTempFile(suffix=".exon.gff")
        target.addChildTargetFn(build_exon_hints, memory=8 * 1024 ** 3, cpu=2,
                                args=[bam_file, exon_hints_path])
    target.setFollowOnTargetFn(cat_hints, args=[intron_hints_tree, exon_hints_tree, genome,
                                                db_path, genome_fasta, hints_dir])
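#The addChildTargetFn/setFollowOnTargetFn pair used above is jobTree's
#scriptTree fan-out/fan-in idiom: children run in parallel, and the follow-on
#runs only once every child (and its descendants) has finished. A minimal
#sketch of the pattern; fan_out, square and merge are hypothetical functions,
#and the logger import path is assumed to match this codebase.
import os
from sonLib.bioio import logger

def fan_out(target):
    result_paths = []
    for i in range(4):
        path = os.path.join(target.getGlobalTempDir(), "part.%d" % i)
        result_paths.append(path)
        target.addChildTargetFn(square, args=(i, path))  #fan out: children run in parallel
    target.setFollowOnTargetFn(merge, args=(result_paths,))  #fan in: runs after all children

def square(target, i, path):
    with open(path, "w") as f:
        f.write(str(i * i))

def merge(target, paths):
    total = sum(int(open(p).read()) for p in paths)
    logger.info("Merged total: %s" % total)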
def run(self):
    tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), "allAgainstAllResults"))
    #Make the list of blast jobs.
    for i in xrange(0, len(self.chunks)):
        for j in xrange(i+1, len(self.chunks)):
            resultsFile = tempFileTree.getTempFile()
            self.resultsFiles.append(resultsFile)
            self.addChildTarget(RunBlast(self.blastOptions, self.chunks[i],
                                         self.chunks[j], resultsFile))
    logger.info("Made the list of all-against-all blasts")
    #Set up the job to collate all the results
    self.setFollowOnTarget(CollateBlasts(self.finalResultsFile, self.resultsFiles))
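#The nested loop above enumerates every unordered pair of chunks, issuing
#n*(n-1)/2 RunBlast jobs in total. itertools.combinations expresses the same
#enumeration and makes the quadratic fan-out easy to see:
from itertools import combinations

chunks = ["chunk0", "chunk1", "chunk2", "chunk3"]
pairs = list(combinations(chunks, 2))
#6 pairs for 4 chunks -- the job count grows quadratically with chunk count:
#[('chunk0', 'chunk1'), ('chunk0', 'chunk2'), ('chunk0', 'chunk3'),
# ('chunk1', 'chunk2'), ('chunk1', 'chunk3'), ('chunk2', 'chunk3')]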
def single_copy_wrapper(target, args):
    """
    Main pipeline wrapper. Runs halSingleCopyRegionsExtract once for each
    region in the conserved_bed file.
    """
    bed_recs = [x.split()[:3] for x in open(args.conserved_bed)]
    result_dir = target.getGlobalTempDir()
    result_tree = TempFileTree(result_dir)
    for chunk in grouper(bed_recs, 10):
        result_path = result_tree.getTempFile()
        target.addChildTargetFn(find_single_copy, args=(args, chunk, result_path))
    target.setFollowOnTargetFn(cat_results, args=(args, result_tree.listFiles()))
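#grouper is not defined in these excerpts; it is assumed to be the usual
#itertools chunking recipe (the real helper may strip the None padding from
#the final chunk). A Python 2 sketch matching the xrange/iteritems idiom used
#elsewhere in this code:
from itertools import izip_longest

def grouper(iterable, n, fillvalue=None):
    #grouper('ABCDEFG', 3) --> ('A','B','C') ('D','E','F') ('G',None,None)
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)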
def extract_maf_wrapper(target, args):
    """
    Main pipeline wrapper. Calls out to hal2maf once for each region in
    args.conserved_bed.
    """
    accelerated_genomes = set(args.accelerated_genomes + [args.ref_genome])
    outgroup_genomes = set(args.target_genomes) - accelerated_genomes
    bed_recs = [x.split() for x in open(args.conserved_bed)]
    result_dir = target.getGlobalTempDir()
    result_tree = TempFileTree(result_dir)
    for chunk in grouper(bed_recs, 50):
        result_path = result_tree.getTempFile()
        target.addChildTargetFn(extract_and_calculate,
                                args=(args, chunk, accelerated_genomes,
                                      outgroup_genomes, result_path))
    target.setFollowOnTargetFn(cat_results, args=(args, result_tree.listFiles()))
def dless_wrapper(target, args, split_ss_dict):
    """
    Wrapper for dless function. Fans out one dless child per split
    sufficient-statistics file, then concatenates the resulting GFFs.
    """
    output_gff_tree = TempFileTree(os.path.join(target.getGlobalTempDir(), 'output_gff'))
    for chromosome, split_ss_dir in split_ss_dict.iteritems():
        for split_ss in os.listdir(split_ss_dir):
            gff_path = output_gff_tree.getTempFile(suffix=split_ss + '.gff')
            split_ss_path = os.path.join(split_ss_dir, split_ss)
            target.addChildTargetFn(dless, args=(split_ss_path, gff_path, args.model))
    target.setFollowOnTargetFn(cat_dless, args=(args, output_gff_tree))
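#The cat_dless follow-on isn't shown in these excerpts. A minimal sketch,
#assuming it simply concatenates the per-window GFFs into a single output
#whose path lives on args; the args.output_gff attribute is a guess.
def cat_dless(target, args, output_gff_tree):
    #Hypothetical sketch: merge every per-window DLESS GFF into one file.
    with open(args.output_gff, "w") as outf:  #args.output_gff is assumed
        for gff_path in output_gff_tree.listFiles():
            with open(gff_path) as inf:
                outf.write(inf.read())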
def main_hints_fn(target, bam_paths, db_path, genome, genome_fasta, hints_dir):
    """
    Main driver function. Loops over each BAM, inferring paired-ness, then
    passes each BAM on, one group of reference names at a time, for filtering.
    Each BAM remains separated until the final concatenation and sorting of
    the hint gffs.
    """
    filtered_bam_tree = TempFileTree(get_tmp(target, global_dir=True, name="filter_file_tree"))
    for bam_path in bam_paths:
        paired = "--paired --pairwiseAlignments" if bam_is_paired(bam_path) else ""
        sam_handle = pysam.Samfile(bam_path)
        for references in group_references(sam_handle):
            out_filter = filtered_bam_tree.getTempFile(suffix=".bam")
            target.addChildTargetFn(sort_by_name, memory=8 * 1024 ** 3, cpu=2,
                                    args=[bam_path, references, out_filter, paired])
    target.setFollowOnTargetFn(build_hints, args=[filtered_bam_tree, genome, db_path,
                                                  genome_fasta, hints_dir])
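#bam_is_paired and group_references are helpers from the surrounding module.
#A plausible sketch of bam_is_paired using the pysam flags the code already
#relies on; the sample size and cutoff are assumptions, not the real values.
import pysam

def bam_is_paired(bam_path, num_reads=1000, paired_cutoff=0.75):
    #Hypothetical sketch: sample the first num_reads records and call the
    #BAM paired if most of them carry the paired flag.
    sam = pysam.Samfile(bam_path)
    paired = 0
    total = 0
    for read in sam:
        if total == num_reads:
            break
        total += 1
        if read.is_paired:
            paired += 1
    sam.close()
    return total > 0 and float(paired) / total >= paired_cutoff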
def run(self):
    chunks1 = self.getChunks(self.sequenceFiles1,
                             makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks1")))
    chunks2 = self.getChunks(self.sequenceFiles2,
                             makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks2")))
    tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), "allAgainstAllResults"))
    resultsFiles = []
    #TODO: Make the compression work
    self.blastOptions.compressFiles = False
    #Make the list of blast jobs.
    for chunk1 in chunks1:
        for chunk2 in chunks2:
            resultsFile = tempFileTree.getTempFile()
            resultsFiles.append(resultsFile)
            self.addChildTarget(RunBlast(self.blastOptions, chunk1, chunk2, resultsFile))
    logger.info("Made the list of blasts")
    #Set up the job to collate all the results
    self.setFollowOnTarget(CollateBlasts(self.finalResultsFile, resultsFiles))
class TestCase(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.jobTreeDir = os.path.join(os.getcwd(), "testJobTree") #A directory for the job tree to be created in
        self.tempFileTreeDir = os.path.join(os.getcwd(), "tempFileTree") #Ensures that the file tree is visible
        self.tempFileTree = TempFileTree(self.tempFileTreeDir) #A place to get temp files from

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        self.tempFileTree.destroyTempFiles()
        system("rm -rf %s %s" % (self.jobTreeDir, self.tempFileTreeDir)) #Cleanup the job tree in case it hasn't already been cleaned up.

    #Only run in singleMachine for now. Experts can run manually on other
    #systems if they choose.
    def dependenciesTest(self, batchSystem="singleMachine", furtherOptionsString=""):
        def fn(tree, maxCpus, maxThreads, size, cpusPerJob, sleepTime):
            system("rm -rf %s" % self.jobTreeDir)
            logName = self.tempFileTree.getTempFile(suffix="_comblog.txt", makeDir=False)
            commandLine = "jobTreeTest_Dependencies.py --jobTree %s --logFile %s " \
                          "--batchSystem '%s' --tree %s --maxCpus %s --maxThreads %s " \
                          "--size %s --cpusPerJob=%s --sleepTime %s %s" % \
                          (self.jobTreeDir, logName, batchSystem, tree, maxCpus,
                           maxThreads, size, cpusPerJob, sleepTime, furtherOptionsString)
            system(commandLine)
        fn("comb", 10, 100, 100, 1, 10)
        fn("comb", 200, 100, 100, 20, 10)
        fn("fly", 10, 8, 100, 1, 10)
        fn("fly", 10, 8, 100, 2, 10)
        fn("balanced", 5, 10, 100, 1, 10)
        fn("balanced", 5, 10, 100, 3, 10)

    def testJobTree_dependencies_singleMachine(self):
        self.dependenciesTest(batchSystem="singleMachine")

    def testJobTree_dependencies_combined(self):
        self.dependenciesTest(batchSystem="singleMachine",
                              furtherOptionsString="--bigBatchSystem singleMachine --bigMemoryThreshold 1000000")

    def testJobTree_dependencies_parasol(self):
        return #Test disabled; remove this return to run it where parasol is available.
        if parasolIsInstalled():
            self.dependenciesTest(batchSystem="parasol")

    def testJobTree_dependencies_gridengine(self):
        return #Test disabled; remove this return to run it where gridengine is available.
        if gridEngineIsInstalled():
            self.dependenciesTest(batchSystem="gridengine")
def testTempFileTree(self):
    for test in range(100): #self.testNo):
        levels = random.choice(range(1, 4))
        fileNo = random.choice(range(1, 6))
        maxTempFiles = int(math.pow(fileNo, levels))
        print("Got %s levels, %s fileNo and %s maxTempFiles" % (levels, fileNo, maxTempFiles))
        tempFileTreeRootDir = os.path.join(self.tempDir, getRandomAlphaNumericString())
        tempFileTree = TempFileTree(tempFileTreeRootDir, fileNo, levels)
        tempFiles = []
        tempDirs = []
        #Check we can make the maximum number of temp files.
        for i in range(maxTempFiles):
            if random.random() > 0.5:
                tempFile = tempFileTree.getTempFile()
                assert os.path.isfile(tempFile)
                tempFiles.append(tempFile)
            else:
                tempFile = tempFileTree.getTempDirectory()
                assert os.path.isdir(tempFile)
                tempDirs.append(tempFile)
        #Check a RuntimeError is raised once the tree is full.
        try:
            tempFileTree.getTempFile()
            assert False
        except RuntimeError:
            logger.debug("Got expected error message")
        #Now remove a few temp files
        while random.random() > 0.1 and len(tempFiles) > 0:
            tempFile = tempFiles.pop()
            assert os.path.isfile(tempFile)
            tempFileTree.destroyTempFile(tempFile)
            assert not os.path.isfile(tempFile)
        #Now remove a few temp dirs
        while random.random() > 0.1 and len(tempDirs) > 0:
            tempDir = tempDirs.pop()
            assert os.path.isdir(tempDir)
            tempFileTree.destroyTempDir(tempDir)
            assert not os.path.isdir(tempDir)
        #Check the tree's view of its files matches what we created.
        assert set(tempFileTree.listFiles()) == set(tempFiles + tempDirs)
        #Either remove all the temp files or just destroy the whole thing
        if random.random() > 0.5:
            #Remove all temp files and check the tree is empty.
            for tempFile in tempFiles:
                tempFileTree.destroyTempFile(tempFile)
            for tempDir in tempDirs:
                tempFileTree.destroyTempDir(tempDir)
            os.remove(os.path.join(tempFileTreeRootDir, "lock"))
            os.rmdir(tempFileTreeRootDir)
        else:
            tempFileTree.destroyTempFiles()
        assert not os.path.isdir(tempFileTreeRootDir)