def write_hint_fasta(hint, seq, chrom, tmp_dir):
    """
    Write the Augustus inputs for one run into tmp_dir.

    hint is written verbatim to a randomly named ".gff" file; seq is written
    to a randomly named ".fa" file as a single FASTA record whose header line
    is ">" followed by chrom.

    Returns the tuple (hints file path, FASTA file path).
    """
    # Draw the two random names in a fixed order: hints first, then FASTA.
    gff_path = os.path.join(tmp_dir, getRandomAlphaNumericString(10) + ".gff")
    fasta_path = os.path.join(tmp_dir, getRandomAlphaNumericString(10) + ".fa")
    # Both files are opened before anything is written to either of them.
    with open(gff_path, "w") as gff_handle:
        with open(fasta_path, "w") as fasta_handle:
            gff_handle.write(hint)
            fasta_handle.write(">{}\n{}\n".format(chrom, seq))
    return gff_path, fasta_path
def write_hint_fasta(hint, seq, chrom, tmp_dir):
    """
    Dump the hints and the sequence to two fresh files under tmp_dir.

    The hints go, unchanged, into a file with a random 10-character stem and
    a ".gff" extension. The sequence goes into a second randomly named ".fa"
    file as one FASTA record titled with chrom.

    Returns (path of the hints file, path of the FASTA file).
    """
    # Name generation order is hints, then sequence.
    hints_name = getRandomAlphaNumericString(10) + ".gff"
    hints_file = os.path.join(tmp_dir, hints_name)
    fasta_name = getRandomAlphaNumericString(10) + ".fa"
    fasta_file = os.path.join(tmp_dir, fasta_name)
    # Open both outputs first, then write each one.
    with open(hints_file, "w") as hints_out:
        with open(fasta_file, "w") as fasta_out:
            hints_out.write(hint)
            fasta_out.write(">{}\n{}\n".format(chrom, seq))
    return hints_file, fasta_file
def run(self):
    """Build a temp file tree, then schedule the child and follow-on targets.

    Raises RuntimeError on roughly half of all invocations (a random draw
    above 0.5), after the tree has been set up but before any target is
    scheduled.
    """
    # ---- Set up a file tree -------------------------------------------
    globalTempDir = self.getGlobalTempDir()
    treeRootDir = os.path.join(globalTempDir, getRandomAlphaNumericString())
    fileTree = TempFileTree(treeRootDir)

    rootFile = fileTree.getTempFile()
    makeFileTree(rootFile, self.depth, fileTree)

    pointerFile = fileTree.getTempFile()
    makeTreePointer(rootFile, pointerFile)
    logger.info("We've set up the file tree")

    # NOTE(review): presumably a deliberate random failure to exercise the
    # scheduler's handling of failed targets -- confirm with the test's owner.
    failNow = random.random() > 0.5
    if failNow:
        raise RuntimeError()

    # ---- Issue the child and follow-on jobs ---------------------------
    childTarget = ChildTarget(pointerFile)
    self.addChildTarget(childTarget)
    cleanupTarget = DestructFileTree(fileTree)
    self.setFollowOnTarget(cleanupTarget)
    logger.info("We've added the child target and finished SetupFileTree.run()")
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path):
    """
    Produces one jobTree target per genePred entry.

    Every line of input_gp becomes one transmap_2_aug child target, and a
    single cat follow-on target is registered afterwards.

    In the future, we could try chunking this per target but during initial
    testing I found that it takes ~15 seconds to extract the RNAseq hints and
    ~1 minute to run each Augustus instance. This seems to be a good time per
    job to me.
    """
    # File tree rooted at the target's global temp dir; it is handed to every
    # child target and to the follow-on.
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString(10))
    # Use a context manager so the genePred file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_gp) as gp_handle:
        for line in gp_handle:
            target.addChildTargetFn(transmap_2_aug, args=[line, genome, sizes_path, fasta_path, out_file_tree])
    target.setFollowOnTargetFn(cat, args=[genome, output_gtf, unsorted_tmp_file, out_file_tree])
def main_fn(target, comp_ann_path, attr_path, ref_gp_path, gencode, genome, biotype, base_out_path, method):
    """
    Produce the bar plot and run scripts/cluster.R for one genome/biotype.

    Nothing is written unless num_biotype exceeds 25 and filter_set holds more
    than 10 entries. The coding classifier set is used when biotype is
    "protein_coding", the noncoding set otherwise; method == "pre_cluster"
    switches munge_data into its pre_cluster mode.
    """
    base_clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = (
        "Proportion of transcripts that fail transMap classifiers\ngenome: {}. {:,} ({:0.2f}%) not OK "
        "transcripts \nGencode set: {} Biotype: {}")
    out_path = os.path.join(base_out_path, biotype, "clustering", method, genome)
    con, cur = attach_databases(comp_ann_path)

    coding = biotype == "protein_coding"
    classifiers = tm_coding_classifiers if coding else tm_noncoding_classifiers

    sql_data = load_data(con, genome, classifiers)
    filter_set, num_biotype = find_aln_id_set(cur, attr_path, ref_gp_path, genome, biotype, classifiers)

    # Too little data to plot: leave without creating any output.
    if not (num_biotype > 25 and len(filter_set) > 10):
        return

    percent_not_ok = round(100.0 * len(filter_set) / num_biotype, 2)
    munged, stats = munge_data(sql_data, filter_set, pre_cluster=(method == "pre_cluster"), coding=coding)

    mkdir_p(out_path)
    barplot_title = base_barplot_title.format(genome, len(filter_set), percent_not_ok, gencode, biotype)
    out_barplot_file = os.path.join(out_path, "barplot{}_{}".format(genome, biotype))
    barplot(stats, out_path, out_barplot_file, barplot_title)

    # TODO: why can't I use local temp? R fails inexplicably
    tmp_name = "{}.txt".format(getRandomAlphaNumericString())
    tmp_path = os.path.join(target.getGlobalTempDir(), tmp_name)
    munged.to_csv(tmp_path)
    out_cluster_file = os.path.join(out_path, "clustering_{}_{}".format(genome, biotype))

    # TODO: why do we have to use my R?
    r_command = "export R_HOME=/cluster/home/ifiddes/lib64/R && /cluster/home/ifiddes/bin/Rscript {}/scripts/cluster.R {} {} {} {} {} {} {} {}".format(
        os.getcwd(), tmp_path, base_clust_title, genome, len(filter_set), percent_not_ok, gencode, biotype,
        out_cluster_file)
    system(r_command)
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path):
    """
    Produces one jobTree target per genePred entry.

    Each line read from input_gp is handed to its own transmap_2_aug child
    target; once all children are registered, a cat follow-on target is set.

    In the future, we could try chunking this per target but during initial
    testing I found that it takes ~15 seconds to extract the RNAseq hints and
    ~1 minute to run each Augustus instance. This seems to be a good time per
    job to me.
    """
    # The file tree lives under the target's global temp dir and is shared
    # with every child target and with the follow-on.
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString(10))
    # Open the genePred inside a `with` block: the previous bare open() in the
    # for-statement never closed the handle explicitly.
    with open(input_gp) as gene_pred:
        for line in gene_pred:
            target.addChildTargetFn(
                transmap_2_aug,
                args=[line, genome, sizes_path, fasta_path, out_file_tree])
    target.setFollowOnTargetFn(
        cat, args=[genome, output_gtf, unsorted_tmp_file, out_file_tree])
def main_ref_fn(target, comp_ann_path, gencode, ref_genome, base_out_path, filter_chroms):
    """
    Plot classifier failures for the reference set and queue its clustering.

    The output directory is always created. The bar plot, the CSV dump and
    the r_wrapper child target are only produced when more than 50 biotype
    ids are found.
    """
    # NOTE(review): `biotype` is not a parameter of this function; presumably
    # it is bound at module level -- confirm against the rest of the file.
    clust_title = "Hierarchical_clustering_of_transcript_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the reference set {}\n")
    out_path = os.path.join(base_out_path, "clustering", ref_genome)
    mkdir_p(out_path)

    con, cur = sql_lib.attach_databases(comp_ann_path, mode="reference")
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) <= 50:
        # Not enough transcripts to be worth plotting.
        return

    sql_data = sql_lib.load_data(con, ref_genome, etc.config.ref_classifiers, primary_key="TranscriptId")
    munged, stats = munge_data(sql_data, biotype_ids)

    # Bar plot of the per-classifier statistics.
    barplot_file = os.path.join(out_path, "reference_barplot_{}".format(gencode))
    title = base_barplot_title.format(biotype.replace("_", " "), gencode)
    plot_lib.barplot(stats, out_path, barplot_file, title)

    # Dump the munged table to the global temp dir and hand it to r_wrapper.
    csv_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
    munged.to_csv(csv_path)
    cluster_file = os.path.join(out_path, "reference_clustering_{}".format(gencode))
    target.addChildTargetFn(r_wrapper, args=[csv_path, clust_title, cluster_file])
def main_fn(target, comp_ann_path, attr_path, ref_gp_path, gencode, genome, biotype, base_out_path, method):
    """
    Write the classifier bar plot and invoke scripts/cluster.R for one
    genome/biotype combination.

    Output is produced only when num_biotype is above 25 and filter_set has
    more than 10 members. biotype "protein_coding" selects the coding
    classifiers; anything else selects the noncoding ones. munge_data runs
    with pre_cluster=True exactly when method is "pre_cluster".
    """
    base_clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = (
        "Proportion of transcripts that fail transMap classifiers\ngenome: {}. {:,} ({:0.2f}%) not OK "
        "transcripts \nGencode set: {} Biotype: {}"
    )
    out_path = os.path.join(base_out_path, biotype, "clustering", method, genome)
    con, cur = attach_databases(comp_ann_path)

    coding = (biotype == "protein_coding")
    if coding:
        classifiers = tm_coding_classifiers
    else:
        classifiers = tm_noncoding_classifiers

    sql_data = load_data(con, genome, classifiers)
    filter_set, num_biotype = find_aln_id_set(cur, attr_path, ref_gp_path, genome, biotype, classifiers)

    enough_data = num_biotype > 25 and len(filter_set) > 10
    if not enough_data:
        return

    percent_not_ok = round(100.0 * len(filter_set) / num_biotype, 2)
    use_pre_cluster = (method == "pre_cluster")
    munged, stats = munge_data(sql_data, filter_set, pre_cluster=use_pre_cluster, coding=coding)

    mkdir_p(out_path)
    title_fields = (genome, len(filter_set), percent_not_ok, gencode, biotype)
    barplot_title = base_barplot_title.format(*title_fields)
    out_barplot_file = os.path.join(out_path, "barplot{}_{}".format(genome, biotype))
    barplot(stats, out_path, out_barplot_file, barplot_title)

    # TODO: why can't I use local temp? R fails inexplicably
    tmp_path = os.path.join(target.getGlobalTempDir(), "{}.txt".format(getRandomAlphaNumericString()))
    munged.to_csv(tmp_path)
    out_cluster_file = os.path.join(out_path, "clustering_{}_{}".format(genome, biotype))

    # TODO: why do we have to use my R?
    script_fields = (
        os.getcwd(),
        tmp_path,
        base_clust_title,
        genome,
        len(filter_set),
        percent_not_ok,
        gencode,
        biotype,
        out_cluster_file,
    )
    system(
        "export R_HOME=/cluster/home/ifiddes/lib64/R && /cluster/home/ifiddes/bin/Rscript {}/scripts/cluster.R {} {} {} {} {} {} {} {}".format(
            *script_fields
        )
    )
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path, hints_db):
    """
    Produces one jobTree target per genePred entry.

    Each line of input_gp is passed to its own transmap_2_aug child target,
    requested with 8 GiB of memory. After all children are registered, a cat
    follow-on target is set with the output path, a scratch file path and the
    shared file tree.
    """
    # File tree rooted at the target's global temp dir; passed to every child
    # target and to the follow-on.
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    # this file will be where we reduce the final results to before sorting
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString(10))
    # Read the genePred through a context manager so the handle is closed as
    # soon as the last child target has been registered.
    with open(input_gp) as gp_handle:
        for line in gp_handle:
            target.addChildTargetFn(transmap_2_aug,
                                    memory=8 * (1024**3),
                                    args=[line, genome, sizes_path, fasta_path, out_file_tree, hints_db])
    target.setFollowOnTargetFn(cat, args=[output_gtf, unsorted_tmp_file, out_file_tree])
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """
    Run blat for every Augustus alignment id in chunk against its source
    transcript and write one summary file.

    For each id the Augustus sequence (from target_fasta) and the matching
    reference sequence (from ref_fasta) are written to two scratch FASTA files
    in the local temp dir and aligned with blat. One tab-separated row
    (id, identity, coverage) is recorded per id; ids with no PSL rows get
    "0" for both values. All rows go into a randomly named ".txt" file under
    out_path. The parameter g is not used here.
    """
    aug_fasta = Fasta(target_fasta)
    reference_fasta = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        tm_aln_id = remove_augustus_alignment_number(aug_aId)
        gencode_id = remove_alignment_number(tm_aln_id)
        gencode_seq = str(reference_fasta[gencode_id])
        aug_seq = str(aug_fasta[aug_aId])

        # The scratch files use fixed names and are overwritten for every id.
        aug_query = os.path.join(target.getLocalTempDir(), "tmp_aug")
        gencode_db = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(aug_query, aug_aId, aug_seq)
        fastaWrite(gencode_db, gencode_id, gencode_seq)

        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(gencode_db, aug_query)
        blat_output = popenCatch(blat_cmd)
        # The last three newline-separated pieces are discarded; presumably
        # blat's trailing status output -- TODO confirm.
        psl_lines = blat_output.split("\n")[:-3]

        if psl_lines:
            psl_rows = [PslRow(psl_line) for psl_line in psl_lines]
            values = [aug_aId, identity(psl_rows), coverage(psl_rows)]
            rows.append([str(value) for value in values])
        else:
            rows.append([aug_aId, "0", "0"])

    summary_path = os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt")
    with open(summary_path, "w") as summary:
        summary.writelines("\t".join(row) + "\n" for row in rows)
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    """
    Plot Augustus classifier failures for one genome and queue the clustering
    jobs, once per Augustus mode ("I1" and "I2").

    Only Augustus alignments whose id, with the Augustus alignment number
    removed, is among the first elements of the values of
    sql_lib.highest_cov_aln are considered. For each mode a bar plot is drawn,
    the munged table is written to the global temp dir and an r_wrapper child
    target is added. filter_chroms is accepted but not used here.
    """
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")

    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    # Python 2 idiom: transpose the dict values and keep the first column.
    transposed = zip(*highest_cov_dict.itervalues())
    highest_cov_ids = set(transposed[0])

    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId",
                                 table="augustus")
    base_filter_set = set()
    for aug_id in sql_data.index:
        if psl_lib.remove_augustus_alignment_number(aug_id) in highest_cov_ids:
            base_filter_set.add(aug_id)

    mode_descriptions = {"1": "trusting RNAseq less", "2": "trusting RNAseq more"}
    for mode in ["1", "2"]:
        tag = "I{}".format(mode)
        aug_mode = mode_descriptions[mode]
        # Keep only the ids that contain this mode's tag.
        filter_set = {aug_id for aug_id in base_filter_set if tag in aug_id}

        barplot_file = os.path.join(out_path, "augustus_barplot_{}_{}_{}".format(genome, gencode, tag))
        title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, barplot_file, title)

        csv_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(csv_path)
        cluster_file = os.path.join(out_path, "augustus_clustering_{}_{}_{}".format(genome, gencode, tag))
        target.addChildTargetFn(r_wrapper, args=[csv_path, clust_title, cluster_file])
def main_fn(target, comp_ann_path, gencode, genome, ref_genome, base_out_path, filter_chroms):
    """
    Plot transMap classifier failures for one genome and queue clustering
    jobs for the "Fail" and "Pass/NotExcellent" categories.

    The output directory is always created. Plots, CSV dumps and r_wrapper
    child targets are only produced when more than 50 biotype ids are found.
    """
    # NOTE(review): `biotype` is not a parameter of this function; presumably
    # it is bound at module level -- confirm against the rest of the file.
    clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the category {} in transMap analysis\n"
                          "Genome: {}. Gencode set: {}. {:,} ({:0.2f}%) of transcripts")
    out_path = os.path.join(base_out_path, "classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="transMap")

    # The third element (the excellent ids) is unpacked but not used below.
    fail_ids, passing_specific_ids, _excellent_ids = sql_lib.get_fail_passing_excel_ids(cur, ref_genome, genome,
                                                                                        biotype)
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) <= 50:
        return

    sql_data = sql_lib.load_data(con, genome, etc.config.clustering_classifiers)
    num_original_introns = sql_lib.load_data(con, genome, ["NumberIntrons"], table="attributes")

    categories = (("Fail", fail_ids), ("Pass/NotExcellent", passing_specific_ids))
    for mode, ids in categories:
        # "/" cannot appear in a file name, so it is swapped for "_".
        mode_underscore = mode.replace("/", "_")
        barplot_file = os.path.join(out_path, "barplot_{}_{}_{}".format(genome, biotype, mode_underscore))
        percentage_of_set = 100.0 * len(ids) / len(biotype_ids)
        title = base_barplot_title.format(biotype.replace("_", " "), mode, genome, gencode, len(ids),
                                          percentage_of_set)
        munged, stats = munge_intron_data(sql_data, num_original_introns, ids)
        plot_lib.barplot(stats, out_path, barplot_file, title)

        csv_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(csv_path)
        cluster_file = os.path.join(out_path, "clustering_{}_{}_{}".format(genome, biotype, mode_underscore))
        target.addChildTargetFn(r_wrapper, args=[csv_path, clust_title, cluster_file])
def getCactusInputs_random(regionNumber=0, tempDir=None, sequenceNumber=None, avgSequenceLength=None, treeLeafNumber=None):
    """Builds a random set of sequence directories and a species tree.

    One temp directory is created (under tempDir) per leaf of a random binary
    tree. FASTA files with the suffix ".fa" or ".fa.complete" are then written
    into randomly chosen directories until at least sequenceNumber records
    exist and every directory has received at least one file. Each record is a
    mutated, possibly reverse-complemented, copy of a random parent sequence.

    Arguments left as None are drawn at random: sequenceNumber from 0..29,
    avgSequenceLength from 1..2999 and treeLeafNumber from 2..3.
    regionNumber is accepted but not used here.

    Returns (list of sequence directories, newick string of the tree).
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(xrange(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(xrange(1, 3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(xrange(2, 4))

    #Make tree
    speciesTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(speciesTree, includeDistances=True)

    #Collect the leaf ids left to right using an explicit stack
    leafNames = []
    pending = [speciesTree]
    while pending:
        node = pending.pop()
        if node.internal:
            #Push right first so that the left subtree is visited first
            pending.append(node.right)
            pending.append(node.left)
        else:
            leafNames.append(node.iD)
    logger.info("Made random binary tree: %s" % newickTreeString)

    #One directory per leaf
    sequenceDirs = [getTempDirectory(rootDir=tempDir) for _ in leafNames]
    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    def newParentSequence():
        #Random sequence whose length is drawn from 1..2*avgSequenceLength-1
        return getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]

    #Random sequences and species labelling
    currentFile = None
    currentHandle = None
    parentSequence = newParentSequence()
    dirsWithoutFiles = set(sequenceDirs)
    written = 0
    while written < sequenceNumber or dirsWithoutFiles:
        if currentFile is None:
            #Randomly choose the files to be attached or not
            suffix = ".fa.complete" if random.random() > 0.5 else ".fa"
            chosenDir = random.choice(sequenceDirs)
            dirsWithoutFiles.discard(chosenDir)
            currentFile = getTempFile(rootDir=chosenDir, suffix=suffix)
            currentHandle = open(currentFile, 'w')
        if random.random() > 0.8:
            #Get a new root sequence
            parentSequence = newParentSequence()
        sequence = mutateSequence(parentSequence, distance=random.random()*0.5)
        name = getRandomAlphaNumericString(15)
        if random.random() > 0.5:
            sequence = reverseComplement(sequence)
        fastaWrite(currentHandle, name, sequence)
        if random.random() > 0.5:
            #Finish this file; the next record starts a new one
            currentHandle.close()
            currentHandle = None
            currentFile = None
        written += 1
    if currentHandle is not None:
        currentHandle.close()
    #Note: the loop can write more than sequenceNumber records; the message
    #reports the requested number.
    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))
    return sequenceDirs, newickTreeString
def getCactusInputs_random(regionNumber=0, tempDir=None, sequenceNumber=None, avgSequenceLength=None, treeLeafNumber=None):
    """Creates random sequence directories together with a species tree.

    A random binary tree is generated and one temp directory (under tempDir)
    is made per leaf. FASTA files named with the suffix ".fa" or
    ".fa.complete" are written into randomly picked directories until at least
    sequenceNumber records exist and no directory is left without a file.
    Every record is a mutated copy of a random parent sequence and may be
    reverse-complemented.

    Arguments left as None are chosen at random: sequenceNumber from 0..29,
    avgSequenceLength from 1..2999 and treeLeafNumber from 2..3.
    regionNumber is accepted but not used here.

    Returns (list of sequence directories, newick string of the tree).
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(range(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(range(1, 3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(range(2, 4))

    #Make tree
    speciesTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(speciesTree, includeDistances=True)

    #Gather the leaf ids left to right with an explicit stack
    leafNames = []
    toVisit = [speciesTree]
    while toVisit:
        node = toVisit.pop()
        if node.internal:
            #Right goes on the stack first so the left child is handled first
            toVisit.append(node.right)
            toVisit.append(node.left)
        else:
            leafNames.append(node.iD)
    logger.info("Made random binary tree: %s" % newickTreeString)

    #One directory per leaf
    sequenceDirs = [getTempDirectory(rootDir=tempDir) for _ in leafNames]
    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    def drawParentSequence():
        #Random sequence whose length is drawn from 1..2*avgSequenceLength-1
        return getRandomSequence(length=random.choice(range(1, 2 * avgSequenceLength)))[1]

    #Random sequences and species labelling
    openFile = None
    openHandle = None
    parentSequence = drawParentSequence()
    unusedDirs = set(sequenceDirs)
    count = 0
    while count < sequenceNumber or unusedDirs:
        if openFile is None:
            #Randomly choose the files to be attached or not
            suffix = ".fa.complete" if random.random() > 0.5 else ".fa"
            pickedDir = random.choice(sequenceDirs)
            unusedDirs.discard(pickedDir)
            openFile = getTempFile(rootDir=pickedDir, suffix=suffix)
            openHandle = open(openFile, 'w')
        if random.random() > 0.8:
            #Get a new root sequence
            parentSequence = drawParentSequence()
        sequence = mutateSequence(parentSequence, distance=random.random() * 0.25)
        name = getRandomAlphaNumericString(15)
        if random.random() > 0.5:
            sequence = reverseComplement(sequence)
        fastaWrite(openHandle, name, sequence)
        if random.random() > 0.5:
            #Close this file; the next record opens a fresh one
            openHandle.close()
            openHandle = None
            openFile = None
        count += 1
    if openHandle is not None:
        openHandle.close()
    #Note: more than sequenceNumber records may have been written; the
    #message reports the requested number.
    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))
    return sequenceDirs, newickTreeString
def testTempFileTree(self):
    """Exercise TempFileTree over 100 randomly sized trees.

    Each round fills a tree to its capacity (fileNo ** levels entries) with a
    random mix of temp files and temp directories, checks that one more
    request raises RuntimeError, destroys a random subset of the entries and
    finally tears the tree down, either entry by entry or in one call.
    """
    for test in range(100): #self.testNo):
        levels = random.choice(range(1, 4))
        fileNo = random.choice(range(1, 6))
        capacity = int(math.pow(fileNo, levels))
        print("Got %s levels, %s fileNo and %s maxTempFiles" % (levels, fileNo, capacity))

        rootDir = os.path.join(self.tempDir, getRandomAlphaNumericString())
        tree = TempFileTree(rootDir, fileNo, levels)
        liveFiles = []
        liveDirs = []

        #Check we can make the maximum number of temp files.
        for _ in range(capacity):
            if random.random() > 0.5:
                path = tree.getTempFile()
                assert os.path.isfile(path)
                liveFiles.append(path)
            else:
                path = tree.getTempDirectory()
                assert os.path.isdir(path)
                liveDirs.append(path)

        #Check an error is raised once the tree is full
        try:
            tree.getTempFile()
            assert False
        except RuntimeError:
            logger.debug("Got expected error message")

        #Now remove a few temp files
        while random.random() > 0.1 and liveFiles:
            path = liveFiles.pop()
            assert os.path.isfile(path)
            tree.destroyTempFile(path)
            assert not os.path.isfile(path)

        #Now remove a few temp dirs
        while random.random() > 0.1 and liveDirs:
            path = liveDirs.pop()
            assert os.path.isdir(path)
            tree.destroyTempDir(path)
            assert not os.path.isdir(path)

        #Check temp files is okay
        #NOTE(review): the result of this comparison is discarded, so nothing
        #is actually verified here; it looks like an `assert` is missing.
        #Confirm what listFiles() returns before turning it into one.
        set(tree.listFiles()) == set(liveFiles + liveDirs)

        #Either remove all the temp files or just destroy the whole thing
        if random.random() > 0.5:
            #Remove all temp files and check thing is empty.
            for path in liveFiles:
                tree.destroyTempFile(path)
            for path in liveDirs:
                tree.destroyTempDir(path)
            os.remove(os.path.join(rootDir, "lock"))
            #rmdir only succeeds on an empty directory
            os.rmdir(rootDir)
        else:
            tree.destroyTempFiles()
            assert not os.path.isdir(rootDir)
def __init__(self, depth=0):
    """Initialise the target with a random `time` value and a random name.

    A random number in [0, 10) is passed to Target.__init__ as its `time`
    keyword argument. A random alphanumeric string is stored as
    self.tempFileName, and depth is stored unchanged as self.depth.
    """
    randomTime = random.random() * 10
    Target.__init__(self, time=randomTime)
    randomName = getRandomAlphaNumericString()
    self.tempFileName = randomName
    self.depth = depth
def __init__(self, depth=0):
    """Set up the target.

    Target.__init__ receives a random value in [0, 10) as its `time` keyword
    argument. The instance then gets a random alphanumeric string as
    tempFileName and the given depth as depth.
    """
    timeValue = 10 * random.random()
    Target.__init__(self, time=timeValue)
    self.tempFileName = getRandomAlphaNumericString()
    self.depth = depth