Example #1
def write_hint_fasta(hint, seq, chrom, tmp_dir):
    """
    Writes the hints and the sequence to temporary files to be used by Augustus.
    """
    hint_f = os.path.join(tmp_dir, getRandomAlphaNumericString(10) + ".gff")
    seq_f = os.path.join(tmp_dir, getRandomAlphaNumericString(10) + ".fa")
    with open(hint_f, "w") as hint_fh, open(seq_f, "w") as seq_fh:
        hint_fh.write(hint)
        seq_fh.write(">{}\n{}\n".format(chrom, seq))
    return hint_f, seq_f
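
Every example in this listing builds throwaway file names with getRandomAlphaNumericString. A minimal sketch of such a helper, assuming it only needs to return a lowercase alphanumeric string of the requested length (the real sonLib implementation may differ in detail):

import random
import string

def getRandomAlphaNumericString(length=10):
    """Return a random lowercase alphanumeric string of the given length."""
    alphabet = string.ascii_lowercase + string.digits
    return "".join(random.choice(alphabet) for _ in range(length))
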
 def run(self):
     ##########################################
     #Setup a file tree.
     ##########################################
         
     tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), getRandomAlphaNumericString()))   
     
     fileTreeRootFile = tempFileTree.getTempFile()
 
     makeFileTree(fileTreeRootFile, self.depth, tempFileTree)
     
     treePointer = tempFileTree.getTempFile()
     
     makeTreePointer(fileTreeRootFile, treePointer)
     
     logger.info("We've set up the file tree")
     
     #Randomly raise an error to simulate a transient job failure
     if random.random() > 0.5:
         raise RuntimeError()
     
     ##########################################
     #Issue the child and follow on jobs
     ##########################################
     
     self.addChildTarget(ChildTarget(treePointer))
     
     self.setFollowOnTarget(DestructFileTree(tempFileTree))
     
     logger.info("We've added the child target and finished SetupFileTree.run()")
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path):
    """
    Produces one jobTree target per genePred entry. In the future, we could try chunking this per target but during
    initial testing I found that it takes ~15 seconds to extract the RNAseq hints and ~1 minute to run each Augustus
    instance. This seems to be a good time per job to me.
    """
    # create a file tree in the global temp directory. This tree will store the gtf created by each Augustus instance
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString(10))
    for line in open(input_gp):
        target.addChildTargetFn(transmap_2_aug, args=[line, genome, sizes_path, fasta_path, out_file_tree])
    target.setFollowOnTargetFn(cat, args=[genome, output_gtf, unsorted_tmp_file, out_file_tree])
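
The docstring above floats chunking genePred entries rather than creating one child target per line. A minimal sketch of that variant, assuming a hypothetical chunk_size and a transmap_2_aug adapted to accept a list of lines instead of a single line:

def chunked_wrapper(target, input_gp, genome, sizes_path, fasta_path, out_file_tree, chunk_size=50):
    # Group genePred lines into fixed-size chunks so each child target carries
    # roughly chunk_size times the ~75 seconds of work measured per entry.
    with open(input_gp) as inf:
        lines = inf.readlines()
    for start in range(0, len(lines), chunk_size):
        chunk = lines[start:start + chunk_size]
        target.addChildTargetFn(transmap_2_aug,
                                args=[chunk, genome, sizes_path, fasta_path, out_file_tree])
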
def main_fn(target, comp_ann_path, attr_path, ref_gp_path, gencode, genome,
            biotype, base_out_path, method):
    base_clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = (
        "Proportion of transcripts that fail transMap classifiers\ngenome: {}.    {:,} ({:0.2f}%) not OK "
        "transcripts \nGencode set: {}    Biotype: {}")
    out_path = os.path.join(base_out_path, biotype, "clustering", method,
                            genome)
    con, cur = attach_databases(comp_ann_path)
    if biotype == "protein_coding":
        classifiers = tm_coding_classifiers
        coding = True
    else:
        classifiers = tm_noncoding_classifiers
        coding = False
    sql_data = load_data(con, genome, classifiers)
    filter_set, num_biotype = find_aln_id_set(cur, attr_path, ref_gp_path,
                                              genome, biotype, classifiers)
    if num_biotype > 25 and len(filter_set) > 10:
        percent_not_ok = round(100.0 * len(filter_set) / num_biotype, 2)
        if method == "pre_cluster":
            munged, stats = munge_data(sql_data,
                                       filter_set,
                                       pre_cluster=True,
                                       coding=coding)
        else:
            munged, stats = munge_data(sql_data,
                                       filter_set,
                                       pre_cluster=False,
                                       coding=coding)
        mkdir_p(out_path)
        barplot_title = base_barplot_title.format(genome, len(filter_set),
                                                  percent_not_ok, gencode,
                                                  biotype)
        out_barplot_file = os.path.join(out_path,
                                        "barplot{}_{}".format(genome, biotype))
        barplot(stats, out_path, out_barplot_file, barplot_title)
        # TODO: why can't I use local temp? R fails inexplicably
        tmp_path = os.path.join(target.getGlobalTempDir(),
                                "{}.txt".format(getRandomAlphaNumericString()))
        munged.to_csv(tmp_path)
        out_cluster_file = os.path.join(
            out_path, "clustering_{}_{}".format(genome, biotype))
        # TODO: why do we have to use my R?
        system(
            "export R_HOME=/cluster/home/ifiddes/lib64/R && /cluster/home/ifiddes/bin/Rscript {}/scripts/cluster.R {} {} {} {} {} {} {} {}"
            .format(os.getcwd(), tmp_path, base_clust_title, genome,
                    len(filter_set), percent_not_ok, gencode, biotype,
                    out_cluster_file))
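
The two TODOs above hint that the shell-based R invocation is fragile. One possible alternative, sketched here with Python's standard subprocess module and reusing the variables from the function above (the hard-coded R paths are kept only as placeholders), is to pass R_HOME through the child environment and the arguments as a list:

import os
import subprocess

# Hypothetical replacement for the system() call above; not the project's code.
# R_HOME is set via the child environment rather than an inline `export`, and
# the arguments are passed as a list so a shell never re-parses them.
env = dict(os.environ, R_HOME="/cluster/home/ifiddes/lib64/R")
cmd = ["/cluster/home/ifiddes/bin/Rscript",
       os.path.join(os.getcwd(), "scripts", "cluster.R"),
       tmp_path, base_clust_title, genome, str(len(filter_set)),
       str(percent_not_ok), gencode, biotype, out_cluster_file]
subprocess.check_call(cmd, env=env)
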
Example #6
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path):
    """
    Produces one jobTree target per genePred entry. In the future, we could try chunking this per target but during
    initial testing I found that it takes ~15 seconds to extract the RNAseq hints and ~1 minute to run each Augustus
    instance. This seems to be a good time per job to me.
    """
    # create a file tree in the global temp directory. This tree will store the gtf created by each Augustus instance
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(),
                                     getRandomAlphaNumericString(10))
    for line in open(input_gp):
        target.addChildTargetFn(
            transmap_2_aug,
            args=[line, genome, sizes_path, fasta_path, out_file_tree])
    target.setFollowOnTargetFn(
        cat, args=[genome, output_gtf, unsorted_tmp_file, out_file_tree])
def main_ref_fn(target, comp_ann_path, gencode, ref_genome, biotype, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_transcript_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the reference set {}\n")
    out_path = os.path.join(base_out_path, "clustering", ref_genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="reference")
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) > 50:
        sql_data = sql_lib.load_data(con, ref_genome, etc.config.ref_classifiers, primary_key="TranscriptId")
        out_barplot_file = os.path.join(out_path, "reference_barplot_{}".format(gencode))
        barplot_title = base_barplot_title.format(biotype.replace("_", " "), gencode)
        munged, stats = munge_data(sql_data, biotype_ids)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "reference_clustering_{}".format(gencode))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def main_fn(target, comp_ann_path, attr_path, ref_gp_path, gencode, genome, biotype, base_out_path, method):
    base_clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = (
        "Proportion of transcripts that fail transMap classifiers\ngenome: {}.    {:,} ({:0.2f}%) not OK "
        "transcripts \nGencode set: {}    Biotype: {}"
    )
    out_path = os.path.join(base_out_path, biotype, "clustering", method, genome)
    con, cur = attach_databases(comp_ann_path)
    if biotype == "protein_coding":
        classifiers = tm_coding_classifiers
        coding = True
    else:
        classifiers = tm_noncoding_classifiers
        coding = False
    sql_data = load_data(con, genome, classifiers)
    filter_set, num_biotype = find_aln_id_set(cur, attr_path, ref_gp_path, genome, biotype, classifiers)
    if num_biotype > 25 and len(filter_set) > 10:
        percent_not_ok = round(100.0 * len(filter_set) / num_biotype, 2)
        if method == "pre_cluster":
            munged, stats = munge_data(sql_data, filter_set, pre_cluster=True, coding=coding)
        else:
            munged, stats = munge_data(sql_data, filter_set, pre_cluster=False, coding=coding)
        mkdir_p(out_path)
        barplot_title = base_barplot_title.format(genome, len(filter_set), percent_not_ok, gencode, biotype)
        out_barplot_file = os.path.join(out_path, "barplot{}_{}".format(genome, biotype))
        barplot(stats, out_path, out_barplot_file, barplot_title)
        # TODO: why can't I use local temp? R fails inexplicably
        tmp_path = os.path.join(target.getGlobalTempDir(), "{}.txt".format(getRandomAlphaNumericString()))
        munged.to_csv(tmp_path)
        out_cluster_file = os.path.join(out_path, "clustering_{}_{}".format(genome, biotype))
        # TODO: why do we have to use my R?
        system(
            "export R_HOME=/cluster/home/ifiddes/lib64/R && /cluster/home/ifiddes/bin/Rscript {}/scripts/cluster.R {} {} {} {} {} {} {} {}".format(
                os.getcwd(),
                tmp_path,
                base_clust_title,
                genome,
                len(filter_set),
                percent_not_ok,
                gencode,
                biotype,
                out_cluster_file,
            )
        )
def wrapper(target, input_gp, output_gtf, genome, sizes_path, fasta_path,
            hints_db):
    """
    Produces one jobTree target per genePred entry.
    """
    # create a file tree in the global temp directory. This tree will store the gtf created by each Augustus instance
    out_file_tree = TempFileTree(target.getGlobalTempDir())
    # this file will be where we reduce the final results to before sorting
    unsorted_tmp_file = os.path.join(target.getGlobalTempDir(),
                                     getRandomAlphaNumericString(10))
    for line in open(input_gp):
        target.addChildTargetFn(transmap_2_aug,
                                memory=8 * (1024**3),
                                args=[
                                    line, genome, sizes_path, fasta_path,
                                    out_file_tree, hints_db
                                ])
    target.setFollowOnTargetFn(
        cat, args=[output_gtf, unsorted_tmp_file, out_file_tree])
Example #10
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    for aug_aId in chunk:
        aId = remove_augustus_alignment_number(aug_aId)
        gencode_id = remove_alignment_number(aId)
        gencode_seq = str(r_f[gencode_id])
        aug_seq = str(g_f[aug_aId])
        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)
        r = popenCatch("blat {} {} -out=psl -noHead /dev/stdout".format(tmp_gencode, tmp_aug))
        r = r.split("\n")[:-3]
        if len(r) == 0:
            results.append([aug_aId, "0", "0"])
        else:
            p_list = [PslRow(x) for x in r]
            results.append(map(str, [aug_aId, identity(p_list), coverage(p_list)]))
    with open(os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt"), "w") as outf:
        for x in results:
            outf.write("\t".join(x) + "\n")
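
The loop above relies on identity() and coverage() to summarize the BLAT hits for each query. A rough sketch of what such helpers could look like, assuming PslRow exposes the standard PSL fields (matches, misMatches, repMatches, qSize); these are illustrative stand-ins, not the project's actual definitions:

def identity(psl_rows):
    # Percent of aligned bases that match, summed over all PSL rows for one query.
    matches = sum(p.matches + p.repMatches for p in psl_rows)
    aligned = sum(p.matches + p.misMatches + p.repMatches for p in psl_rows)
    return round(100.0 * matches / aligned, 2) if aligned else 0.0

def coverage(psl_rows):
    # Percent of the query sequence covered by aligned blocks.
    covered = sum(p.matches + p.misMatches + p.repMatches for p in psl_rows)
    q_size = psl_rows[0].qSize
    return round(100.0 * covered / q_size, 2) if q_size else 0.0
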
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId", 
                                 table="augustus")
    base_filter_set = {x for x in sql_data.index if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids}
    for mode in ["1", "2"]:
        i = "I{}".format(mode)
        aug_mode = "trusting RNAseq more" if mode == "2" else "trusting RNAseq less"
        filter_set = {x for x in base_filter_set if i in x}
        out_barplot_file = os.path.join(out_path, "augustus_barplot_{}_{}_{}".format(genome, gencode, i))
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_{}_{}_{}".format(genome, gencode, i))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def main_fn(target, comp_ann_path, gencode, genome, ref_genome, biotype, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the category {} in transMap analysis\n"
                          "Genome: {}.  Gencode set: {}.  {:,} ({:0.2f}%) of transcripts")
    out_path = os.path.join(base_out_path, "classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="transMap")
    fail_ids, passing_specific_ids, excellent_ids = sql_lib.get_fail_passing_excel_ids(cur, ref_genome, genome, biotype)
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) > 50:
        sql_data = sql_lib.load_data(con, genome, etc.config.clustering_classifiers)
        num_original_introns = sql_lib.load_data(con, genome, ["NumberIntrons"], table="attributes")
        for mode, ids in zip(*[["Fail", "Pass/NotExcellent"], [fail_ids, passing_specific_ids]]):
            mode_underscore = mode.replace("/", "_")
            out_barplot_file = os.path.join(out_path, "barplot_{}_{}_{}".format(genome, biotype, mode_underscore))
            percentage_of_set = 100.0 * len(ids) / len(biotype_ids)
            barplot_title = base_barplot_title.format(biotype.replace("_" , " "), mode, genome, gencode, len(ids), 
                                                      percentage_of_set)
            munged, stats = munge_intron_data(sql_data, num_original_introns, ids)
            plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
            data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
            munged.to_csv(data_path)
            out_cluster_file = os.path.join(out_path, "clustering_{}_{}_{}".format(genome, biotype, mode_underscore))
            target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
Example #13
def getCactusInputs_random(regionNumber=0, tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Gets a random set of sequences, each of length given, and a species
    tree relating them. Each sequence is a assigned an event in this tree.
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(xrange(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(xrange(1,3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(xrange(2, 4))
    #Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)
    newickTreeLeafNames = []
    def fn(tree):
        if tree.internal:
            fn(tree.left)
            fn(tree.right)
        else:
            newickTreeLeafNames.append(tree.iD)
    fn(binaryTree)
    logger.info("Made random binary tree: %s" % newickTreeString)
    
    sequenceDirs = []
    for i in xrange(len(newickTreeLeafNames)):
        seqDir = getTempDirectory(rootDir=tempDir)
        sequenceDirs.append(seqDir)

    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    #Random sequences and species labelling
    sequenceFile = None
    fileHandle = None
    parentSequence = getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]
    emptySequenceDirs = set(sequenceDirs)
    i = 0
    while i < sequenceNumber or len(emptySequenceDirs) > 0:
        #for i in xrange(sequenceNumber):
        if sequenceFile == None:
            if random.random() > 0.5: #Randomly choose the files to be attached or not
                suffix = ".fa.complete"
            else:
                suffix = ".fa"
            sequenceDir = random.choice(sequenceDirs)
            if sequenceDir in emptySequenceDirs:
                emptySequenceDirs.remove(sequenceDir)
            sequenceFile = getTempFile(rootDir=sequenceDir, suffix=suffix)
            fileHandle = open(sequenceFile, 'w')
        if random.random() > 0.8: #Get a new root sequence
            parentSequence = getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]
        sequence = mutateSequence(parentSequence, distance=random.random()*0.5)
        name = getRandomAlphaNumericString(15)
        if random.random() > 0.5:
            sequence = reverseComplement(sequence)
        fastaWrite(fileHandle, name, sequence)
        if random.random() > 0.5:
            fileHandle.close()
            fileHandle = None
            sequenceFile = None
        i += 1
    if fileHandle != None:
        fileHandle.close()

    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))
    
    return sequenceDirs, newickTreeString
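
A hypothetical way to drive the function above from a quick test; the call with default arguments and the shutil cleanup are purely illustrative and not part of the original test suite:

import shutil

# Illustrative driver for the example above.
sequenceDirs, newickTreeString = getCactusInputs_random()
print("Species tree: %s" % newickTreeString)
print("Sequence directories: %s" % ", ".join(sequenceDirs))
for seqDir in sequenceDirs:
    shutil.rmtree(seqDir)  # clean up the random FASTA directories when done
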
Example #14
def getCactusInputs_random(regionNumber=0,
                           tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Gets a random set of sequences, each of length given, and a species
    tree relating them. Each sequence is a assigned an event in this tree.
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(list(range(30)))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(list(range(1, 3000)))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(list(range(2, 4)))

    #Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)
    newickTreeLeafNames = []

    def fn(tree):
        if tree.internal:
            fn(tree.left)
            fn(tree.right)
        else:
            newickTreeLeafNames.append(tree.iD)

    fn(binaryTree)
    logger.info("Made random binary tree: %s" % newickTreeString)

    sequenceDirs = []
    for i in range(len(newickTreeLeafNames)):
        seqDir = getTempDirectory(rootDir=tempDir)
        sequenceDirs.append(seqDir)

    logger.info("Made a set of random directories: %s" %
                " ".join(sequenceDirs))

    #Random sequences and species labelling
    sequenceFile = None
    fileHandle = None
    parentSequence = getRandomSequence(
        length=random.choice(list(range(1, 2 * avgSequenceLength))))[1]
    emptySequenceDirs = set(sequenceDirs)
    i = 0
    while i < sequenceNumber or len(emptySequenceDirs) > 0:
        if sequenceFile == None:
            if random.random() > 0.5:  #Randomly choose the files to be attached or not
                suffix = ".fa.complete"
            else:
                suffix = ".fa"
            sequenceDir = random.choice(sequenceDirs)
            if sequenceDir in emptySequenceDirs:
                emptySequenceDirs.remove(sequenceDir)
            sequenceFile = getTempFile(rootDir=sequenceDir, suffix=suffix)
            fileHandle = open(sequenceFile, 'w')
        if random.random() > 0.8:  #Get a new root sequence
            parentSequence = getRandomSequence(
                length=random.choice(list(range(1, 2 * avgSequenceLength))))[1]
        sequence = mutateSequence(parentSequence,
                                  distance=random.random() * 0.25)
        name = getRandomAlphaNumericString(15)
        if random.random() > 0.5:
            sequence = reverseComplement(sequence)
        fastaWrite(fileHandle, name, sequence)
        if random.random() > 0.5:
            fileHandle.close()
            fileHandle = None
            sequenceFile = None
        i += 1
    if fileHandle != None:
        fileHandle.close()

    logger.info("Made %s sequences in %s directories" %
                (sequenceNumber, len(sequenceDirs)))

    return sequenceDirs, newickTreeString
Example #15
    def testTempFileTree(self):
        for test in range(100):  #self.testNo
            levels = random.choice(range(1, 4))
            fileNo = random.choice(range(1, 6))
            maxTempFiles = int(math.pow(fileNo, levels))

            print("Got %s levels, %s fileNo and %s maxTempFiles" %
                  (levels, fileNo, maxTempFiles))

            tempFileTreeRootDir = os.path.join(self.tempDir,
                                               getRandomAlphaNumericString())
            tempFileTree = TempFileTree(tempFileTreeRootDir, fileNo, levels)

            tempFiles = []
            tempDirs = []
            #Check we can make the maximum number of temp files.
            for i in range(maxTempFiles):
                if random.random() > 0.5:
                    tempFile = tempFileTree.getTempFile()
                    assert os.path.isfile(tempFile)
                    tempFiles.append(tempFile)
                else:
                    tempFile = tempFileTree.getTempDirectory()
                    assert os.path.isdir(tempFile)
                    tempDirs.append(tempFile)

            #Check an error is raised once the tree is full
            try:
                tempFileTree.getTempFile()
                assert False
            except RuntimeError:
                logger.debug("Got expected error message")

            #Now remove a few temp files
            while random.random() > 0.1 and len(tempFiles) > 0:
                tempFile = tempFiles.pop()
                assert os.path.isfile(tempFile)
                tempFileTree.destroyTempFile(tempFile)
                assert not os.path.isfile(tempFile)

            #Now remove a few temp dirs
            while random.random() > 0.1 and len(tempDirs) > 0:
                tempDir = tempDirs.pop()
                assert os.path.isdir(tempDir)
                tempFileTree.destroyTempDir(tempDir)
                assert not os.path.isdir(tempDir)

            #Check the set of listed files matches what we created
            assert set(tempFileTree.listFiles()) == set(tempFiles + tempDirs)

            #Either remove all the temp files or just destroy the whole thing
            if random.random() > 0.5:
                #Remove all temp files and check thing is empty.
                for tempFile in tempFiles:
                    tempFileTree.destroyTempFile(tempFile)
                for tempDir in tempDirs:
                    tempFileTree.destroyTempDir(tempDir)
                os.remove(os.path.join(tempFileTreeRootDir, "lock"))
                os.rmdir(tempFileTreeRootDir)
            else:
                tempFileTree.destroyTempFiles()
                assert not os.path.isdir(tempFileTreeRootDir)
Example #16
 def __init__(self, depth=0):
     Target.__init__(self, time=random.random() * 10)
     self.tempFileName = getRandomAlphaNumericString()
     self.depth = depth