Example #1
0
def MoveGeneMarkFiles(workdir, genome):
    """
    Tidy up after a GeneMark-ES run.

    Every known GeneMark-ES by-product found in the current directory is moved into
    <workdir>/gmes/<genome>/. If an entry of the same name and kind is already stored
    there, the fresh copy in the current directory is deleted instead.
    """
    # Per-genome destination folder; created on demand.
    gmes = "{0}/gmes/{1}/".format(workdir, genome)
    TryMkDirs(gmes)

    logging.info("PanGuess: Moving/Removing GeneMark-ES temporary files and folders.")

    # Names GeneMark-ES leaves behind for each genome run.
    leftovers = ("data", "info", "output", "run", "gmes.log", "run.cfg",
                 "prot_seq.faa", "nuc_seq.fna", "genemark.gtf")
    for name in leftovers:
        stored = "{0}/{1}".format(gmes, name)
        if os.path.isdir(name):
            if os.path.isdir(stored):
                # Already archived: drop the new folder.
                shutil.rmtree(name)
            else:
                shutil.move(name, gmes)
        elif os.path.isfile(name):
            if os.path.isfile(stored):
                # Already archived: drop the new file.
                os.remove(name)
            else:
                shutil.move(name, gmes)
Example #2
0
def ConstructGeneModelSets(attributes, exonerate_genes, workdir, genome, tag):
    """
    Build the gene model set for a genome and write it to <workdir>/sets/.

    Three files are produced: <tag>.faa (proteins), <tag>.nucl (nucleotides) and
    <tag>.attributes (tab-separated copy of `attributes`).

    attributes      -- list of mutable rows; row[1] is the gene ID and row[4] names the
                       source the call came from.
    exonerate_genes -- sequence of objects exposing .id, .prot and .nucl.
    workdir         -- directory under which the "sets" folder is created.
    genome          -- currently unused; kept for interface compatibility.
    tag             -- strain tag, prefixed to every sequence ID as "<tag>|<id>".

    NOTE: only rows whose source starts with "Exonerate" contribute sequences. The
    GeneMark-ES and TransDecoder extraction that used to live here is disabled, so
    rows from those sources are written to the attributes file unchanged.

    Side effect: row[1] of every Exonerate row is rewritten in place to the tagged ID.
    Raises IndexError if an Exonerate row has no matching entry in exonerate_genes.
    """
    # Master lists.
    prot_models = []
    nucl_models = []

    # Try to make a directory for protein sets.
    sdir = "{0}/sets".format(workdir)
    TryMkDirs(sdir)

    # Loop over attributes, extract gene from given source based on parent method.
    for gene in attributes:
        if not gene[4].startswith("Exonerate"):
            continue

        # A list comprehension (rather than filter()) keeps the result indexable on
        # both Python 2 and Python 3.
        match = [hit for hit in exonerate_genes if hit.id == gene[1]]
        source = match[0]
        tagged_id = "{0}|{1}".format(tag, source.id)

        prot_models.append(SeqRecord(Seq(source.prot), id=tagged_id))
        nucl_models.append(SeqRecord(Seq(source.nucl), id=tagged_id))
        gene[1] = tagged_id

    # Write protein sequences to file.
    with open("{0}/{1}.faa".format(sdir, tag), "w") as outpro:
        SeqIO.write(prot_models, outpro, "fasta")

    # Write nucleotide sequences to file.
    with open("{0}/{1}.nucl".format(sdir, tag), "w") as outnuc:
        SeqIO.write(nucl_models, outnuc, "fasta")

    # Write attributes to file.
    with open("{0}/{1}.attributes".format(sdir, tag), "w") as outatt:
        for line in attributes:
            outatt.write("\t".join(str(el) for el in line) + "\n")
Example #3
0
def BuildRefSet(workdir, ref):
    """
    Build temporary set of reference proteins under <workdir>/ref, one FASTA file
    (<record id>.faa) per sequence in `ref`.

    It's faster to run Exonerate by splitting up the dataset into individual files and
    running them as separate queries against the genome than as a full file.

    workdir -- directory under which the "ref" folder is created.
    ref     -- path to the user-provided reference protein FASTA file.
    """
    # Make folder for reference proteins, if not already present.
    ref_folder = "{0}/ref".format(workdir)
    TryMkDirs(ref_folder)

    # Split user-provided reference set into individual proteins (have to do this).
    ref_db = SeqIO.index(ref, "fasta")
    try:
        logging.info("PanGuess: Building reference protein sequence dataset.")
        for key in ref_db:
            # Fetch the record once; every lookup on the index re-parses it from disk.
            record = ref_db[key]
            SeqIO.write(record, "{0}/{1}.faa".format(ref_folder, record.id), "fasta")
    finally:
        # Release the index's file handle even if a write fails part-way through.
        ref_db.close()
Example #4
0
def KaryoPloteR(tags, karyotypes, lengths):
    """
    Run Karyotype.R for all strains in a dataset and collect the plots in ./karyoplots.

    The R script is looked up in the directory of the running program (sys.argv[0]).
    `tags`, `karyotypes` and `lengths` are handed to it unchanged as command-line
    arguments. `tags` is additionally read here as a text file with one strain tag per
    line; for every tag, <tag>_components.eps and <tag>_orthologs.eps are expected in
    the current directory and are relocated to ./karyoplots.
    """
    karyopath = os.path.dirname(os.path.realpath(sys.argv[0])) + "/Karyotype.R"
    sp.call(["Rscript", karyopath, tags, karyotypes, lengths])

    # Don't rewrite work directory if already there.
    kdir = "./karyoplots"
    TryMkDirs(kdir)

    # Read the tag list through a context manager so the handle is closed promptly.
    with open(tags) as tag_file:
        strain_tags = [line.strip("\n") for line in tag_file]

    for tag in strain_tags:
        plots = ["{0}_{1}.eps".format(tag, kind) for kind in ("components", "orthologs")]
        # Copy-then-delete (instead of shutil.move) so plots from an earlier run that
        # already sit in kdir are overwritten rather than causing an error.
        for plot in plots:
            shutil.copy(plot, kdir)
        for plot in plots:
            os.remove(plot)
Example #5
0
def RunBUSCO(buscopath, lineagepath, gene_sets):
    """
    Runs BUSCO analysis on every protein set and writes output files to BUSCO folder.

    buscopath   -- path to the BUSCO executable/script.
    lineagepath -- path passed to BUSCO's -l option.
    gene_sets   -- iterable of protein set paths; the last path component of each is
                   used to name that run's output ("<name>.busco").

    After each run the folder "run_<name>.busco" is moved from the current directory
    into ./busco.
    """
    bdir = "./busco"

    # Don't rewrite work directory if already there.
    TryMkDirs(bdir)

    for gene_set in gene_sets:
        wd = gene_set.split("/")[-1]
        cmd = [
            buscopath, "-i", gene_set, "-l", lineagepath, "-o",
            "{0}.busco".format(wd), "-m", "prot"
        ]
        # Parenthesised single-argument form prints the same text on Python 2 and 3.
        print("Running BUSCO")
        sp.call(cmd)
        shutil.move("run_{0}.busco".format(wd), bdir)
Example #6
0
def RunTransDecoder(ncr, tp_path, tl_path, workdir, genome, td_len):
    """
    Write the NCR sequences to <workdir>/td/<genome>/NCR.fna and run the two
    TransDecoder executables on that file, `tl_path` first and `tp_path` second.

    Returns the TransDecoder directory so the caller can tidy up afterwards.
    """
    # Per-genome TransDecoder folder; created on demand.
    tdir = "{0}/td/{1}/".format(workdir, genome)
    TryMkDirs(tdir)

    # Dump the NCRs into the FASTA file both commands read from.
    ncr_fasta = "{0}/NCR.fna".format(tdir)
    with open(ncr_fasta, "w") as handle:
        handle.writelines(ncr)

    # The two steps run strictly one after the other.
    first_cmd = [tl_path, "-t", ncr_fasta, "-m", "{0}".format(td_len)]
    second_cmd = [tp_path, "-t", ncr_fasta, "--single_best_only"]
    for cmd in (first_cmd, second_cmd):
        sp.call(cmd)

    return tdir
Example #7
0
def PanOCTOutputHandler():
    """
    Collect the output files PanOCT is expected to leave in the current directory
    (which may differ from what the user actually asked for) into the "panoct" folder.

    Entries already present in "panoct" under the same name and kind are not replaced;
    the copy in the current directory is deleted instead.
    """
    # Wildcard matches first, then the fixed-name reports.
    leftovers = []
    for pattern in ("*pairwise*", "*cluster*", "*paralog*", "*matchtable*"):
        leftovers += glob(pattern)
    leftovers += ["centroids.fasta", "fragments_fusions.txt", "id.txt",
                  "missing_blast_results.txt", "parameters.txt", "report.txt"]

    tdir = "panoct"
    TryMkDirs(tdir)

    for name in leftovers:
        stored = "{0}/{1}".format(tdir, name)
        if os.path.isdir(name):
            if os.path.isdir(stored):
                shutil.rmtree(name)
            else:
                shutil.move(name, tdir)
        elif os.path.isfile(name):
            if os.path.isfile(stored):
                os.remove(name)
            else:
                shutil.move(name, tdir)
Example #8
0
def MakeWorkingDirs():
    """
    Ask TryMkDirs to set up the "go" working directory (relative to the current
    directory).
    """
    TryMkDirs("go")
Example #9
0
def MakeWorkingDir(workdir):
    """
    Set up the work directory at `workdir` by delegating to TryMkDirs.

    Presumably a no-op when the directory already exists -- that behaviour lives in
    TryMkDirs, which is defined elsewhere; confirm there.
    """
    # All of the work (including any "already exists" handling) is done by TryMkDirs.
    TryMkDirs(workdir)
Example #10
0
def GenerateClusterFASTAs(genomes, refined=False):
    """
    Extract gene model clusters from full database and write out nucleotide and protein
    sequence families to file.

    Clusters from ./panoct/matchtable.txt are written under ./panoct/clusters/. When
    `refined` is true, clusters from ./panoct/refined_matchtable.txt are additionally
    written under ./panoct/clusters/refined.

    genomes -- passed to ConcatenateDatasets when either concatenated database
               (./gm_pred/sets/allnucl.db or allprot.db) is missing.
    refined -- also write the refined cluster set.
    """
    nt_db = "./gm_pred/sets/allnucl.db"
    aa_db = "./gm_pred/sets/allprot.db"

    # Rebuild the concatenated databases if either one is missing.
    if not (os.path.isfile(nt_db) and os.path.isfile(aa_db)):
        ConcatenateDatasets(genomes)

    nt_index = SeqIO.index(nt_db, "fasta")
    try:
        aa_index = SeqIO.index(aa_db, "fasta")
        try:
            _WriteClusterFASTAs("./panoct/matchtable.txt", "./panoct/clusters/",
                                nt_index, aa_index)
            if refined:
                _WriteClusterFASTAs("./panoct/refined_matchtable.txt",
                                    "./panoct/clusters/refined", nt_index, aa_index)
        finally:
            # Release the index file handles even if a lookup or write fails.
            aa_index.close()
    finally:
        nt_index.close()


def _WriteClusterFASTAs(matchtable, outdir, nt_index, aa_index):
    """
    Write one .fna/.faa pair per core and accessory cluster of `matchtable` under
    <outdir>/{core,acc}/{fna,faa}/.
    """
    TryMkDirs(outdir)
    for group in ("core", "acc"):
        for ext in ("faa", "fna"):
            TryMkDirs("{0}/{1}/{2}".format(outdir, group, ext))

    core, acc = ParseMatchtable(matchtable)

    # Only accessory clusters have their falsy members skipped; core members are
    # looked up as-is (presumably every core slot is filled -- confirm against
    # ParseMatchtable).
    for group, prefix, clusters, skip_empty in (("core", "Core", core, False),
                                                ("acc", "Acc", acc, True)):
        for cluster in clusters:
            members = clusters[cluster]
            if skip_empty:
                members = [member for member in members if member]

            # Resolve both sequence lists before touching the disk, so a missing ID
            # does not leave a half-written cluster behind.
            nt_seqs = [nt_index[member] for member in members]
            aa_seqs = [aa_index[member] for member in members]

            for ext, seqs in (("fna", nt_seqs), ("faa", aa_seqs)):
                path = "{0}/{1}/{2}/{3}_{4}.{2}".format(outdir, group, ext, prefix, cluster)
                with open(path, "w") as handle:
                    SeqIO.write(seqs, handle, "fasta")
Example #11
0
def RemoveDubiousCalls(results, sets):
    """
    Remove gene calls flagged as dubious by the QCBLAST results from the gene model sets.

    For every query that has hits, the top hit is flagged when the shorter of the
    query/hit lengths is at least 70% of the longer one. For each path in `sets` whose
    tag (file name up to the first ".") has flagged calls, the current
    ./gm_pred/sets/<tag>.faa, .nucl and .attributes files are copied to
    ./gm_pred/sets/old/ and then rewritten without the flagged calls.

    results -- iterable of result sets; each yields queries exposing .hits, .seq_len
               and .id, with hits exposing .seq_len and .id ("<tag>|<gene>").
    sets    -- iterable of gene set paths.
    """
    logging.info("QualityCheck: Filtering gene model sets for dubious calls.")
    # Master list for calls to remove.
    to_remove = []

    # Loop through all QCBLAST results, flag top-hits that have >=70% sequence coverage with a dubious gene.
    for result in results:
        for query in result:
            if not query.hits:
                continue
            top_hit = query.hits[0]
            query_len = query.seq_len
            subj_len = top_hit.seq_len
            # float() forces true division; under Python 2 int / int truncates, which
            # would collapse the ratio to 0 or 1.
            ratio = float(min(query_len, subj_len)) / max(query_len, subj_len)
            if ratio >= 0.7:
                to_remove.append(top_hit.id)
                logging.info(
                    "QualityCheck: {0} has >=70% length overlap with {1}, assigning {0} as a"
                    " dubious call.".format(top_hit.id, query.id))

    # Remove flagged calls from nucleotide and protein sets, and genomic attributes file.
    for path in sets:
        genome = path.split("/")[-1]
        tag = genome.split(".")[0]
        # Real lists/sets (not filter()) so truthiness, len() and repeated membership
        # tests behave the same on Python 2 and Python 3.
        tr_strain = [call for call in to_remove if call.split("|")[0] == tag]
        if not tr_strain:
            continue
        flagged = set(tr_strain)

        aa_path = "./gm_pred/sets/{0}.faa".format(tag)
        nt_path = "./gm_pred/sets/{0}.nucl".format(tag)
        at_path = "./gm_pred/sets/{0}.attributes".format(tag)

        # Load everything into memory first; the same paths are overwritten below.
        with open(aa_path) as handle:
            current_prot = list(SeqIO.parse(handle, "fasta"))
        with open(nt_path) as handle:
            current_nucl = list(SeqIO.parse(handle, "fasta"))
        with open(at_path) as handle:
            current_att = list(reader(handle, delimiter="\t"))
        to_move = [aa_path, nt_path, at_path]
        TryMkDirs("./gm_pred/sets/old/")

        new_prot = [rec for rec in current_prot if rec.id not in flagged]
        new_nucl = [rec for rec in current_nucl if rec.id not in flagged]
        new_att = [row for row in current_att if row[1] not in flagged]

        logging.info("QualityCheck: Removed {0} dubious calls from {1},"
                     " writing remaining calls to new files.".format(
                         len(tr_strain), genome))

        # Keep a backup of the unfiltered files before overwriting them.
        logging.info("QualityCheck: Moving old calls.")
        for f in to_move:
            shutil.copy(f, "./gm_pred/sets/old/")

        # Write protein sequences to file.
        with open(aa_path, "w") as outpro:
            SeqIO.write(new_prot, outpro, "fasta")

        # Write nucleotide sequences to file.
        with open(nt_path, "w") as outnuc:
            SeqIO.write(new_nucl, outnuc, "fasta")

        # Write attributes to file.
        with open(at_path, "w") as outatt:
            for line in new_att:
                outatt.write("\t".join(str(el) for el in line) + "\n")

    logging.info(
        "QualityCheck: Completed removal of dubious calls from all datasets.")