def MoveGeneMarkFiles(workdir, genome):
    """
    Relocate GeneMark-ES by-products out of the current directory.

    Each known GeneMark-ES output (file or folder) found in the current
    working directory is moved into ``<workdir>/gmes/<genome>/``. When an
    entry with the same name is already stored there, the copy in the
    current directory is deleted instead of being moved.
    """
    # Names GeneMark-ES leaves behind after each genome run.
    leftovers = ("data", "info", "output", "run", "gmes.log", "run.cfg",
                 "prot_seq.faa", "nuc_seq.fna", "genemark.gtf")

    # Per-genome holding folder for the temporary files.
    gmes = "{0}/gmes/{1}/".format(workdir, genome)
    TryMkDirs(gmes)

    logging.info("PanGuess: Moving/Removing GeneMark-ES temporary files and folders.")
    for name in leftovers:
        stored = "{0}/{1}".format(gmes, name)
        if os.path.isdir(name):
            # Folder: keep the stored copy if there is one, else move it in.
            if os.path.isdir(stored):
                shutil.rmtree(name)
            else:
                shutil.move(name, gmes)
        elif os.path.isfile(name):
            # File: same rule as for folders.
            if os.path.isfile(stored):
                os.remove(name)
            else:
                shutil.move(name, gmes)
def ConstructGeneModelSets(attributes, exonerate_genes, workdir, genome, tag):
    """
    Build the gene model set for a genome and write it to ``<workdir>/sets``.

    For every row of ``attributes`` whose source column (index 4) starts with
    "Exonerate", the matching object in ``exonerate_genes`` (matched on its
    ``id`` against column 1) supplies the protein (``.prot``) and nucleotide
    (``.nucl``) sequences. Both records are given the ID ``<tag>|<id>`` and
    column 1 of the attribute row is rewritten in place to that same ID.

    Three files are written: ``<tag>.faa`` (proteins), ``<tag>.nucl``
    (nucleotides) and ``<tag>.attributes`` (tab-separated attribute rows,
    including rows from other sources, which are written unchanged).

    NOTE: extraction of GeneMark-ES and TransDecoder models (previously read
    from SeqIO indexes under ``<workdir>/gmes/<genome>/`` and
    ``<workdir>/td/<genome>/``) is disabled, so ``genome`` is currently
    unused and rows from those sources contribute no sequences.

    Raises:
        IndexError: if an Exonerate attribute row refers to an ID that is not
            present in ``exonerate_genes``.
    """
    # Master lists.
    prot_models = []
    nucl_models = []

    # Try to make a directory for protein sets.
    sdir = "{0}/sets".format(workdir)
    TryMkDirs(sdir)

    # ID -> gene model lookup, built on first use so that exonerate_genes is
    # only touched when an Exonerate row actually exists. setdefault keeps the
    # FIRST model for a duplicated ID, matching the old first-match behaviour.
    exonerate_index = None

    for gene in attributes:
        if not gene[4].startswith("Exonerate"):
            continue

        if exonerate_index is None:
            exonerate_index = {}
            for model in exonerate_genes:
                exonerate_index.setdefault(model.id, model)

        try:
            match = exonerate_index[gene[1]]
        except KeyError:
            # Keep the exception type callers saw before (IndexError from an
            # empty match list), but say which gene was missing.
            raise IndexError(
                "No Exonerate gene model found for ID {0!r}".format(gene[1]))

        new_id = "{0}|{1}".format(tag, match.id)
        prot_models.append(SeqRecord(Seq(match.prot), id=new_id))
        nucl_models.append(SeqRecord(Seq(match.nucl), id=new_id))
        # Keep the attribute row in sync with the renamed sequence records.
        gene[1] = new_id

    # Write protein sequences to file.
    with open("{0}/{1}.faa".format(sdir, tag), "w") as outpro:
        SeqIO.write(prot_models, outpro, "fasta")

    # Write nucleotide sequences to file.
    with open("{0}/{1}.nucl".format(sdir, tag), "w") as outnuc:
        SeqIO.write(nucl_models, outnuc, "fasta")

    # Write attributes to file.
    with open("{0}/{1}.attributes".format(sdir, tag), "w") as outatt:
        for line in attributes:
            outatt.write("\t".join(str(el) for el in line) + "\n")
def BuildRefSet(workdir, ref):
    """
    Build temporary set of reference proteins.

    Splits the FASTA file ``ref`` into one file per record, written to
    ``<workdir>/ref/<record id>.faa``. It's faster to run Exonerate by
    splitting up the dataset into individual files and running them as
    separate queries against the genome than as a full file.
    """
    # Make folder for reference proteins, if not already present.
    ref_folder = "{0}/ref".format(workdir)
    TryMkDirs(ref_folder)

    # Split user-provided reference set into individual proteins.
    ref_db = SeqIO.index(ref, "fasta")
    try:
        logging.info("PanGuess: Building reference protein sequence dataset.")
        for key in ref_db:
            # Fetch once: every index lookup re-reads the record from disk.
            record = ref_db[key]
            SeqIO.write(
                record, "{0}/{1}.faa".format(ref_folder, record.id), "fasta")
    finally:
        # Release the index's file handle even if a write fails.
        ref_db.close()
def KaryoPloteR(tags, karyotypes, lengths):
    """
    Run Karyotype.R for all strains in a dataset and collect the plots.

    ``Karyotype.R`` is looked up in the directory of the running script and
    invoked through ``Rscript`` with ``tags``, ``karyotypes`` and ``lengths``
    as its arguments. Afterwards, for every tag listed (one per line) in the
    ``tags`` file, ``<tag>_components.eps`` and ``<tag>_orthologs.eps`` are
    copied from the current directory into ``./karyoplots`` and the originals
    are deleted. Blank lines in the tags file are ignored.
    """
    karyopath = os.path.dirname(os.path.realpath(sys.argv[0])) + "/Karyotype.R"
    sp.call(["Rscript", karyopath, tags, karyotypes, lengths])

    # Don't rewrite the plot directory if already there.
    kdir = "./karyoplots"
    TryMkDirs(kdir)

    # Read the tag list up front so the file handle is closed promptly.
    with open(tags) as tag_file:
        strain_tags = [line.strip("\n") for line in tag_file]

    for tag in strain_tags:
        if not tag:
            # A blank line would otherwise point at a bogus "_components.eps".
            continue
        plots = [
            "{0}_components.eps".format(tag),
            "{0}_orthologs.eps".format(tag),
        ]
        # Copy both plots before deleting either one, so a missing plot never
        # leaves the strain half-collected. copy+remove (rather than move)
        # lets a plot from an earlier run be overwritten.
        for plot in plots:
            shutil.copy(plot, kdir)
        for plot in plots:
            os.remove(plot)
def RunBUSCO(buscopath, lineagepath, gene_sets):
    """
    Run a protein-mode BUSCO analysis on each gene set.

    For every path in ``gene_sets`` BUSCO is invoked with the lineage at
    ``lineagepath`` and an output name of ``<file name>.busco``; the
    ``run_<file name>.busco`` folder is then moved into ``./busco``.
    """
    # Don't rewrite work directory if already there.
    bdir = "./busco"
    TryMkDirs(bdir)

    for gene_set in gene_sets:
        # Last path component names the BUSCO run.
        wd = gene_set.split("/")[-1]
        print("Running BUSCO")
        sp.call([
            buscopath,
            "-i", gene_set,
            "-l", lineagepath,
            "-o", "{0}.busco".format(wd),
            "-m", "prot",
        ])
        shutil.move("run_{0}.busco".format(wd), bdir)
def RunTransDecoder(ncr, tp_path, tl_path, workdir, genome, td_len):
    """
    Write the NCR sequences to disk and run both TransDecoder executables.

    The lines in ``ncr`` are written to ``<workdir>/td/<genome>/NCR.fna``.
    ``tl_path`` is then run on that file with ``-m <td_len>``, followed by
    ``tp_path`` with ``--single_best_only``. Returns the TransDecoder
    directory so the caller can tidy up its files afterwards.
    """
    # Per-genome TransDecoder folder; created now as it holds the input too.
    tdir = "{0}/td/{1}/".format(workdir, genome)
    TryMkDirs(tdir)

    # Dump the NCRs as the FASTA input shared by both commands.
    ncr_fasta = "{0}/NCR.fna".format(tdir)
    with open(ncr_fasta, "w") as outfile:
        outfile.writelines(ncr)

    # The two TransDecoder steps run one after the other.
    sp.call([tl_path, "-t", ncr_fasta, "-m", "{0}".format(td_len)])
    sp.call([tp_path, "-t", ncr_fasta, "--single_best_only"])

    return tdir
def PanOCTOutputHandler():
    """
    Gather the expected PanOCT output into the ``panoct`` directory.

    The expected names (glob matches plus a fixed list) might differ from
    what the user actually asked PanOCT to write. Anything found in the
    current directory is moved into ``panoct``; if an entry of the same name
    is already there, the copy in the current directory is deleted instead.
    """
    # Pattern-matched outputs first, then the fixed-name files.
    candidates = []
    for pattern in ("*pairwise*", "*cluster*", "*paralog*", "*matchtable*"):
        candidates.extend(glob(pattern))
    candidates.extend(["centroids.fasta", "fragments_fusions.txt", "id.txt",
                       "missing_blast_results.txt", "parameters.txt",
                       "report.txt"])

    tdir = "panoct"
    TryMkDirs(tdir)

    for name in candidates:
        stored = "{0}/{1}".format(tdir, name)
        if os.path.isdir(name):
            if os.path.isdir(stored):
                shutil.rmtree(name)
            else:
                shutil.move(name, tdir)
        elif os.path.isfile(name):
            if os.path.isfile(stored):
                os.remove(name)
            else:
                shutil.move(name, tdir)
def MakeWorkingDirs():
    """
    Tries to make the "go" directory (relative to the current directory)
    if it is not already present.
    """
    TryMkDirs("go")
def MakeWorkingDir(workdir):
    """
    Tries to make the work directory ``workdir``.

    Creation is delegated to TryMkDirs so that a directory which is already
    there is not rewritten.
    """
    TryMkDirs(workdir)
def _WriteClusterFamilies(matchtable, out_dir, nt_index, aa_index):
    """
    Write one nucleotide and one protein FASTA file per cluster in
    ``matchtable`` under ``out_dir`` (``core/`` and ``acc/`` subfolders).
    """
    TryMkDirs(out_dir)
    for group in ("core", "acc"):
        for ext in ("faa", "fna"):
            TryMkDirs("{0}/{1}/{2}".format(out_dir, group, ext))

    core, acc = ParseMatchtable(matchtable)

    # (subfolder, file prefix, clusters, drop empty members?)
    # Only accessory clusters have empty member slots filtered out; an empty
    # member in a core cluster still fails loudly on the index lookup.
    layout = (
        ("core", "Core", core, False),
        ("acc", "Acc", acc, True),
    )
    for group, prefix, clusters, skip_empty in layout:
        for cluster in clusters:
            members = clusters[cluster]
            if skip_empty:
                members = [member for member in members if member]
            nt_seqs = [nt_index[member] for member in members]
            aa_seqs = [aa_index[member] for member in members]
            nt_path = "{0}/{1}/fna/{2}_{3}.fna".format(
                out_dir, group, prefix, cluster)
            aa_path = "{0}/{1}/faa/{2}_{3}.faa".format(
                out_dir, group, prefix, cluster)
            with open(nt_path, "w") as nt_out:
                SeqIO.write(nt_seqs, nt_out, "fasta")
            with open(aa_path, "w") as aa_out:
                SeqIO.write(aa_seqs, aa_out, "fasta")


def GenerateClusterFASTAs(genomes, refined=False):
    """
    Extract gene model clusters from full database and write out nucleotide
    and protein sequence families to file.

    The combined databases ``./gm_pred/sets/allnucl.db`` and
    ``./gm_pred/sets/allprot.db`` are (re)built with ConcatenateDatasets if
    either is missing. Clusters from ``./panoct/matchtable.txt`` are written
    under ``./panoct/clusters/``; when ``refined`` is true, clusters from
    ``./panoct/refined_matchtable.txt`` are additionally written under
    ``./panoct/clusters/refined``.
    """
    nt_db = "./gm_pred/sets/allnucl.db"
    aa_db = "./gm_pred/sets/allprot.db"
    if not (os.path.isfile(nt_db) and os.path.isfile(aa_db)):
        ConcatenateDatasets(genomes)

    # Close both indexes when done, even if writing a cluster fails.
    nt_index = SeqIO.index(nt_db, "fasta")
    try:
        aa_index = SeqIO.index(aa_db, "fasta")
        try:
            _WriteClusterFamilies(
                "./panoct/matchtable.txt", "./panoct/clusters/",
                nt_index, aa_index)
            if refined:
                _WriteClusterFamilies(
                    "./panoct/refined_matchtable.txt",
                    "./panoct/clusters/refined",
                    nt_index, aa_index)
        finally:
            aa_index.close()
    finally:
        nt_index.close()
def _FlagDubiousCalls(results):
    """
    Return the IDs of top hits whose length is within 70% of their query's.
    """
    flagged = []
    for result in results:
        for query in result:
            if not query.hits:
                continue
            top_hit = query.hits[0]
            longer = max(query.seq_len, top_hit.seq_len)
            if not longer:
                # Zero-length sequences cannot be compared; nothing to flag.
                continue
            # float() forces true division: on Python 2 int / int floors, so
            # the ratio would be 0 for any pair of unequal lengths.
            ratio = float(min(query.seq_len, top_hit.seq_len)) / longer
            if ratio >= 0.7:
                flagged.append(top_hit.id)
                logging.info(
                    "QualityCheck: {0} has >=70% length overlap with {1}, assigning {0} as a"
                    " dubious call.".format(top_hit.id, query.id))
    return flagged


def RemoveDubiousCalls(results, sets):
    """
    Remove gene calls flagged as dubious from the per-strain gene model sets.

    ``results`` is an iterable of search results, each an iterable of queries
    exposing ``id``, ``seq_len`` and ``hits`` (hits expose ``id`` and
    ``seq_len``). A query's top hit is flagged when the shorter of the two
    sequence lengths is at least 70% of the longer one.

    For every path in ``sets`` the strain tag is the file name up to its
    first ".". Flagged IDs whose prefix before "|" equals that tag are
    removed from ``./gm_pred/sets/<tag>.faa``, ``.nucl`` and ``.attributes``
    (attributes are matched on column 1). The previous versions of those
    files are first copied to ``./gm_pred/sets/old/``. Strains with no
    flagged calls are left untouched.
    """
    logging.info("QualityCheck: Filtering gene model sets for dubious calls.")

    # Master list for calls to remove.
    to_remove = _FlagDubiousCalls(results)

    # Remove flagged calls from nucleotide and protein sets, and genomic
    # attributes file.
    for path in sets:
        genome = path.split("/")[-1]
        tag = genome.split(".")[0]

        # A set gives O(1) membership tests and counts each call only once,
        # even when several queries flagged the same gene.
        tr_strain = set(x for x in to_remove if x.split("|")[0] == tag)
        if not tr_strain:
            continue

        aa_path = "./gm_pred/sets/{0}.faa".format(tag)
        nt_path = "./gm_pred/sets/{0}.nucl".format(tag)
        at_path = "./gm_pred/sets/{0}.attributes".format(tag)

        # Load everything up front (and close the handles) because the same
        # paths are overwritten further down.
        with open(aa_path) as handle:
            current_prot = list(SeqIO.parse(handle, "fasta"))
        with open(nt_path) as handle:
            current_nucl = list(SeqIO.parse(handle, "fasta"))
        with open(at_path) as handle:
            current_att = list(reader(handle, delimiter="\t"))

        TryMkDirs("./gm_pred/sets/old/")

        new_prot = [rec for rec in current_prot if rec.id not in tr_strain]
        new_nucl = [rec for rec in current_nucl if rec.id not in tr_strain]
        new_att = [row for row in current_att if row[1] not in tr_strain]

        logging.info("QualityCheck: Removed {0} dubious calls from {1},"
                     " writing remaining calls to new files.".format(
                         len(tr_strain), genome))

        # Back up the originals before they are overwritten in place.
        logging.info("QualityCheck: Moving old calls.")
        for f in (aa_path, nt_path, at_path):
            shutil.copy(f, "./gm_pred/sets/old/")

        # Write protein sequences to file.
        with open(aa_path, "w") as outpro:
            SeqIO.write(new_prot, outpro, "fasta")

        # Write nucleotide sequences to file.
        with open(nt_path, "w") as outnuc:
            SeqIO.write(new_nucl, outnuc, "fasta")

        # Write attributes to file.
        with open(at_path, "w") as outatt:
            for line in new_att:
                outatt.write("\t".join(str(el) for el in line) + "\n")

    logging.info(
        "QualityCheck: Completed removal of dubious calls from all datasets.")