class sRNADetection(object):
    """Orchestrate sRNA detection.

    Pipeline stages: intergenic and UTR-derived candidate calling,
    secondary-structure/folding-energy filtering, BLAST (nr/sRNA database)
    filtering, sORF comparison, classification, and table generation.

    NOTE(review): depends on project helpers (``Helper``, ``Multiparser``,
    ``ArgsContainer``, ``intergenic_srna``, ``utr_derived_srna``, ...) whose
    exact semantics are not visible in this file — behavior notes below are
    based on how they are called here.
    """

    def __init__(self, args_srna):
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        # Optional inputs: the "tmp" sub-folder path is None when the
        # corresponding input folder was not supplied.
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        # Prefixes of the temporary files written by the different stages.
        self.prefixs = {
            "merge": os.path.join(args_srna.out_folder, "tmp_merge"),
            "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"),
            "normal": os.path.join(args_srna.out_folder, "tmp_normal"),
            "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"),
            "merge_table": os.path.join(args_srna.out_folder,
                                        "tmp_merge_table"),
            "utr_table": os.path.join(args_srna.out_folder,
                                      "tmp_utrsrna_table"),
            "normal_table": os.path.join(args_srna.out_folder,
                                         "tmp_normal_table"),
            "in_cds_table": os.path.join(args_srna.out_folder,
                                         "tmp_incds_table"),
            "basic": os.path.join(args_srna.out_folder, "tmp_basic"),
            "energy": os.path.join(args_srna.out_folder, "tmp_energy")}
        self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"),
                     "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")}
        self.best_table = os.path.join(self.table_output, "best")
        # fix: removed duplicated re-assignments of self.table_output and
        # self.stat_path (the original set both twice with identical values).
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best")}

    def _check_folder_exist(self, folder):
        """Return ``folder/tmp`` if *folder* was supplied, else None."""
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        """Validate attribute uniqueness of every .gff file in *gffs*."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        """Run makeblastdb to build a BLAST database (stderr -> *err*)."""
        call([os.path.join(blast_path, "makeblastdb"), "-in", database,
              "-dbtype", type_, "-out", db_file], stderr=err)

    def _formatdb(self, database, type_, out_folder, blast_path,
                  database_type):
        """Build a BLAST database, resolving *database* to a fasta file
        first when it was given without a fasta extension."""
        # Use a context manager so the log handle is closed even if
        # makeblastdb or the sRNA-database reformat raises.
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            if not database.endswith((".fa", ".fna", ".fasta")):
                # *database* has no fasta extension; find the matching fasta
                # file that lives next to it.
                folders = database.split("/")
                filename = folders[-1]
                folder = "/".join(folders[:-1])
                for fasta in os.listdir(folder):
                    if fasta.endswith((".fa", ".fna", ".fasta")):
                        if ".".join(fasta.split(".")[:-1]) == filename:
                            database = os.path.join(folder, fasta)
            if database_type == "sRNA":
                # sRNA databases need their headers reformatted in place.
                change_format(database, "tmp_srna_database")
                os.remove(database)
                shutil.move("tmp_srna_database", database)
            db_file = ".".join(database.split(".")[:-1])
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        """Merge fragmented- and TEX-library results into the final
        merge_gff/merge_csv files (whichever library sets were supplied)."""
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        """Detect intergenic sRNAs for one genome *prefix*.

        Returns the TSS file to use for downstream UTR-derived detection
        (possibly the classified TSS file produced by this step).
        """
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if ("tss" in args_srna.import_info):
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(
                self.pro_path, "_processing.gff", prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            intergenic_srna(args_srna)
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            intergenic_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # Prefer the classified TSS file when intergenic_srna produced one.
        if "TSS_class" in os.listdir(args_srna.out_folder):
            tss = os.path.join(args_srna.out_folder, "TSS_class",
                               prefix + "_TSS.gff")
        return tss

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna):
        """Detect UTR-derived sRNAs for one genome *prefix* and apply the
        minimum-UTR-length filter to the merged result."""
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"),
                "tex", prefix, args_srna)
            utr_derived_srna(args_srna)
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag_table",
                                                prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"),
                "frag", prefix, args_srna)
            utr_derived_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        """Validate required inputs and pre-parse all auxiliary files.

        Exits the process if mandatory inputs are missing.
        """
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        # Fasta sequences are only needed when folding or BLAST runs.
        if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
                "blast_nr" in args_srna.import_info) or (
                "blast_srna" in args_srna.import_info):
            if args_srna.fastas is None:
                print("Error: lack required fasta files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _run_program(self, args_srna):
        """Run intergenic and (optionally) UTR-derived detection for every
        genome and return the list of genome prefixes processed."""
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"], prefix])}
                csvs = {"merge": "_".join([
                            self.prefixs["merge_table"], prefix]),
                        "utr": "_".join([self.prefixs["utr_table"], prefix]),
                        "normal": "_".join([
                            self.prefixs["normal_table"], prefix])}
                tss = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of {0}".format(
                        prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff",
                            prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss, pro,
                                          args_srna)
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss)
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"],
                                               prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss):
        """Merge intergenic and UTR-derived candidates into one gff/table."""
        print("merging data of intergenic and UTR_derived sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs,
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "forward.wig"])),
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "reverse.wig"])),
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        """Fold the candidate sequences with RNAfold.

        NOTE(review): built as a shell string via os.system; safe only
        because all paths are pipeline-internal.
        """
        os.system(" ".join(["cat", seq_file, "|",
                            os.path.join(vienna_path, "RNAfold"),
                            "-p", ">", sec_file]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        """Extract candidate sequences, fold them and compute energies.

        Changes the working directory into a tmp folder (RNAfold writes its
        plots into the cwd) and returns the absolute paths needed later.
        """
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
                    fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder,
                                    "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder,
                                    "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        # Re-anchor every path to the original cwd since we just chdir'd.
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(os.path.join(main_path,
                       "_".join([self.prefixs["basic"], prefix])),
                       sec_file, os.path.join(main_path,
                       "_".join([self.prefixs["energy"], prefix])))
        # "|" in sequence ids breaks downstream tools; sanitize file names.
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path,
                "main": main_path, "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file):
        """Run ViennaRNA relplot.pl to annotate a structure plot."""
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp_paths["tmp"], file_),
                            os.path.join(tmp_paths["tmp"], dot_file),
                            ">", os.path.join(tmp_paths["tmp"], rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        """Convert a PostScript plot to PDF with ps2pdf14."""
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_),
              pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        """Replot structure/dot plots, convert them to PDF, and file them
        into per-genome sec/dot output folders."""
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
            ["rss.pdf"])
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
            ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        """Run ViennaRNA mountain.pl on a dot plot (stdout -> *out*)."""
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths,
                       prefix, vienna_util):
        """Generate mountain plots for every dot plot if requested."""
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"],
                                            "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths,
                                       dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix,
                                             moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        """Fold candidates, plot structures, and fold the energy column
        into the basic candidate file for every genome."""
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                self.fasta_path, args_srna.out_folder, prefix, sec_path,
                dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path,
                                tmp_paths, prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            os.chdir(tmp_paths["main"])
            # The energy-annotated file replaces the basic candidate file.
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
        shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        """Run a BLAST search of *seq_file* against *database*."""
        call([os.path.join(blast_path, program), "-db", database,
              "-evalue", str(e), "-strand", strand, "-query", seq_file,
              "-out", blast_file])

    def _get_strand_fasta(self, seq_file, out_folder):
        """Split a candidate fasta into plus-/minus-strand files based on
        the trailing +/- of each header line; return the two paths."""
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    # Sequence lines follow the strand of the last header.
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        """BLAST every genome's candidates against *database* and fold the
        hits back into the basic candidate files."""
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                    args_srna.out_folder, "blast_result_and_misc",
                    "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0}".format(prefix))
                seq_name = "_".join(["sRNA_seq", prefix])
                seq_file = os.path.join(args_srna.out_folder, seq_name)
                # fix: compare the bare file name against os.listdir();
                # the original compared the full path (which never matches
                # a directory listing), so the sequence file was always
                # regenerated even when it already existed.
                if seq_name not in os.listdir(args_srna.out_folder):
                    self.helper.get_seq(
                        srna_file,
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        seq_file)
                if database_type == "nr":
                    # nr is protein: search each strand separately with
                    # blastx and merge the reports.
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                        seq_file, args_srna.out_folder)
                    tmp_blast = "tmp_blast.txt"  # single-arg join removed
                    self._run_blast(args_srna.blast_path, program, database,
                                    e, tmp_plus, tmp_blast, "plus")
                    self._run_blast(args_srna.blast_path, program, database,
                                    e, tmp_minus, blast_file, "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program, database,
                                    e, seq_file, blast_file, "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        """Classify candidates by supporting evidence and generate the
        per-class gff and table files."""
        # fix: the original used "or", which is a tautology (a length can
        # never equal both 1 and 0), so classification ran unconditionally.
        # "and" expresses the intent: classify only when more than one kind
        # of supporting information was imported.
        if (len(args_srna.import_info) != 1) and (
                len(args_srna.import_info) != 0):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(class_table,
                                                           prefix))
                self.helper.check_make_folder(os.path.join(class_gff,
                                                           prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                # NOTE(review): re-creating the same folders again is
                # redundant but kept — check_make_folder may reset content.
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                classify_srna(os.path.join(self.all_best["all_gff"],
                                           "_".join([prefix, "sRNA.gff"])),
                              class_gff, out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table)

    def _get_best_result(self, prefixs, args_srna):
        """Select the best candidates and write best gff + table files."""
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(os.path.join(self.all_best["all_gff"],
                                       "_".join([prefix, "sRNA.gff"])),
                          best_gff, args_srna)
            gen_srna_table(os.path.join(self.all_best["best_gff"],
                                        "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table)

    def _remove_file(self, args_srna):
        """Delete all temporary files and folders created by the run."""
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        """Apply the requested filters (2D structure/energy, BLAST nr,
        BLAST sRNA database, sORF overlap) to the basic candidates."""
        if "sec_str" in args_srna.import_info:
            self._compute_2d_and_energy(args_srna, prefixs)
        if "blast_nr" in args_srna.import_info:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        if "blast_srna" in args_srna.import_info:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)
        if "sorf" in args_srna.import_info:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))

    def _import_info_format(self, import_info):
        """Return *import_info* with every entry lower-cased."""
        return [info.lower() for info in import_info]

    def _gen_table(self, prefixs, args_srna):
        """Write the all-candidates table for every genome."""
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(os.path.join(self.all_best["all_gff"],
                                        "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table)

    def _print_rank_all(self, prefixs):
        """Rank all candidates against the best set for every genome."""
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        """Apply the minimum-UTR-length filter to the final outputs."""
        for prefix in prefixs:
            filter_utr(os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"])),
                       os.path.join(self.all_best["all_table"],
                                    "_".join([prefix, "sRNA.csv"])),
                       min_utr)

    def _antisense(self, gffs, prefixs):
        """Annotate antisense relationships in the final gffs/tables."""
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        """Produce per-genome BLAST classification statistics."""
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path,
                "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        """Associate candidates with terminators and promoters when the
        corresponding information was imported."""
        if ("term" in args_srna.import_info) and (
                self.term_path is not None):
            compare_srna_term(os.path.join(self.all_best["all_gff"],
                                           "_".join([prefix, "sRNA.gff"])),
                              out_table,
                              os.path.join(self.term_path,
                                           "_".join([prefix, "term.gff"])),
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if ("promoter" in args_srna.import_info) and (
                args_srna.promoter_table is not None) and (
                "tss" in args_srna.import_info):
            compare_srna_promoter(os.path.join(
                self.all_best["all_gff"],
                "_".join([prefix, "sRNA.gff"])), out_table, args_srna)

    def run_srna_detection(self, args_srna):
        """Entry point: run the full sRNA detection pipeline."""
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        args_srna.import_info = self._import_info_format(
            args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                                         "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]),
                prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if "blast_srna" in args_srna.import_info:
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class sORFDetection(object):
    '''Detection of sORFs: find intergenic regions, search them for
    start/stop-codon-bounded ORFs with ribosome binding sites, and
    produce statistics and coverage tables.'''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Optional inputs: the "tmp" sub-folder path is None when the
        # corresponding input folder was not supplied.
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        # Sub-folder names for the two candidate tiers.
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        """Validate attribute uniqueness of every .gff file in *gffs*."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        """Validate required inputs and pre-parse auxiliary gff files.

        Exits the process (after logging) if mandatory inputs are missing.
        """
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
                (args_sorf.tex_wigs is None) and (
                args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file,
                           os.path.join(args_sorf.out_folder,
                                        "_".join([prefix, "inter.gff"])),
                           tss_file,
                           os.path.join(args_sorf.wig_path,
                                        "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                                        "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                                        "_".join([prefix, "sORF"])),
                           args_sorf)
            # sorf_detection writes *_all/*_best files; rename them into
            # the all_candidates/best_candidates layout.
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix,
                                                   "sORF_all.gff"])),
                            gff_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix,
                                                   "sORF_best.gff"])),
                            gff_best)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix,
                                                   "sORF_all.csv"])),
                            csv_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix,
                                                   "sORF_best.csv"])),
                            csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        """Delete temporary files and folders created by the run.

        NOTE(review): remove_tmp_dir is called with args_sorf.tsss/srnas
        even when they are None — presumably it tolerates None; confirm
        against Helper.remove_tmp_dir.
        """
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(
                    prefix))
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path,
                                            "_".join([prefix,
                                                      "transcript.gff"])),
                               os.path.join(args_sorf.out_folder,
                                            "_".join([prefix,
                                                      "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo,
                               args_sorf.extend_5, args_sorf.extend_3)
                log.write("\t" + os.path.join(
                    args_sorf.out_folder, "_".join([prefix, "inter.gff"])) +
                    " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        """Rewrite both candidate tables with per-track coverage detail."""
        log.write("Running re_table.py for generating coverage "
                  "information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(
                    args_sorf.out_folder, "tables", type_,
                    "_".join([prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        """Entry point: validate inputs, detect sORFs, run statistics,
        update tables and clean up temporary files."""
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path,
                                       None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output,
                                            self.all_cand)):
            if sorf.endswith("_sORF.gff"):
                # fix: announce statistics only for files that are actually
                # processed; the original printed "Running statistics of X"
                # for every directory entry before the endswith check.
                print("Running statistics of {0}".format(sorf))
                stat_file = os.path.join(
                    args_sorf.out_folder, "statistics",
                    "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     stat_file, args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)
class TSSpredator(object):
    """Wrapper that drives TSSpredator (Java) to detect TSSs/processing
    sites, then merges, filters, validates and summarizes the results."""

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Per-genome TSSpredator MasterTables are collected here.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of transient files/folders created in the working directory.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # NOTE(review): manual_path is only set when --manual is given;
        # _merge_manual assumes it exists — confirm callers guard on manual.
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Turn one colon-split library spec into a lookup dict."""
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write one config line per condition/replicate for the given
        library list; missing replicates get an empty assignment so
        TSSpredator sees a complete matrix."""
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    # Fix: pad with num_id (the current condition) instead of
                    # cond["condition"], which was stale/undefined whenever a
                    # condition had no libraries at all.
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, num_id, rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix,
                      log):
        """Invoke the TSSpredator jar for one genome and log its outputs."""
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        """Parse library specs (wig:tex/notex:cond:rep:strand), resolve wig
        filenames for this genome, and write the five-prime sections of the
        config file. Returns counts and the parsed library dict."""
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # Replace the generic wig name with the strain-specific file.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # f=TEX-treated, n=untreated; p/m=plus/minus strand.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            # For processing sites the untreated libs play the 5' role.
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            # Fix: typo "assing" -> "assign".
            print("Error: Wrong program name! Please assign tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # "all_N" applies one replicate-match number globally; otherwise the
        # most common per-library value becomes the default and exceptions
        # are written per library.
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        """Load the best parameter set produced by optimize_tss_ps from
        best_<prefix>.csv; exits with an error if absent or malformed."""
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file), "r")
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines)-1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be due to that \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines)-1].split("\t")[1].split("_")
                    detect_all = all(elem in para_info
                                     for elem in ["he", "rh", "fa", "rf",
                                                  "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        # Fix: message was missing "not" — it fires when the
                        # file is NOT complete.
                        print("Error: {0} is not complete. Some parameters "
                              "are missing!".format(best_file))
                        log.write("Error: {0} is not complete. Some "
                                  "parameters are missing!\n".format(
                                      best_file))
                        sys.exit()
                    else:
                        detect = True
                        # Each value follows its two-letter tag in the
                        # underscore-separated parameter string.
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        """Pick the user-supplied parameter set for this genome, either the
        single shared set or the one matching its position in
        --genome_order."""
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not "
                      "assigned!".format(prefix))
                log.write("Error: the parameters for {0} were not "
                          "assigned!\n".format(prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file, log):
        '''generation of config files'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor = \
                self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor = \
                self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        # Fixed TSSpredator settings, then the tunable cliff parameters.
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" libraries are the complement of the 5' ones chosen
        # in _import_lib.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        """Convert each genome's MasterTable.tsv into a TSS/PS gff file."""
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                # Fix: grammar of both error messages.
                print("Error: There is no MasterTable file in {0} ".format(
                    out_path))
                print("Please check configuration file.")
                log.write("No MasterTable file is found in {0}\n".format(
                    out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                # Fix: missing space before "is generated".
                log.write("\t" + out_file + " is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual
        detected TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tss"]))
        for tss in tsss:
            # Find the matching annotation gff for this genome.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                # Fix: typo "classiflying" -> "classifying".
                print("Merging and classifying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual,
                    fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # validate_gff rewrites the annotation; put it back in place.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        """Per-genome classification/library statistics and venn plots."""
        # Fix: typo "statistaics" -> "statistics".
        print("Running statistics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        """Genomes that have a fasta, an annotation gff and wig data."""
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): detect is never reset to False, so once
                    # one genome has wig data every later match is accepted;
                    # preserved as-is to avoid changing pipeline behavior.
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate all matching wigs into tmp/merge_forward.wig and
        tmp/merge_reverse.wig."""
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete all temporary files/folders left by the run."""
        # Fix: typo "temperary" -> "temporary".
        print("Remove temporary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        # Idiom fix: replaced "if not X: pass else:" with a positive guard.
        if args_tss.overlap_feature:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss, log):
        """Full pipeline: config generation, TSSpredator runs, gff
        conversion, optional orphan/manual/low-expression handling,
        statistics, validation, and cleanup."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        # Downstream file naming uses "processing" rather than "ps".
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None,
                                         args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class sORFDetection(object):
    '''detection of sORF'''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # TSS/sRNA inputs are optional; their tmp paths stay None if absent.
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        # Subfolder names used under both gff and table output folders.
        self.all_cand = "all_candidates"
        self.best = "best"

    def _check_gff(self, gffs):
        # Validate attribute uniqueness of every .gff file in the folder.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf):
        # Annotation, transcripts and at least one wig set are mandatory;
        # UTR-derived detection additionally needs TSSs.
        if (args_sorf.gffs is None) or \
           (args_sorf.trans is None) or \
           ((args_sorf.tex_wigs is None) and (args_sorf.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: lack required files for UTR derived"
                      " sORF detection!!!!")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf):
        '''detect the sORF based on start and stop codon
        and ribosome binding site'''
        for prefix in prefixs:
            # Optional companion files; None when the input was not given.
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file,
                           os.path.join(args_sorf.out_folder,
                                        "_".join([prefix, "inter.gff"])),
                           tss_file,
                           os.path.join(args_sorf.wig_path,
                                        "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                                        "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                                        "_".join([prefix, "sORF"])),
                           args_sorf)
            # Rename the *_all/*_best outputs into their final
            # all_candidates/best gff and table locations.
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                shutil.move(os.path.join(
                    self.gff_output, self.all_cand,
                    "_".join([prefix, "sORF_all.gff"])),
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF.gff"])))
                shutil.move(os.path.join(
                    self.gff_output, self.all_cand,
                    "_".join([prefix, "sORF_best.gff"])),
                    os.path.join(self.gff_output, self.best,
                                 "_".join([prefix, "sORF.gff"])))
                shutil.move(os.path.join(
                    self.gff_output, self.all_cand,
                    "_".join([prefix, "sORF_all.csv"])),
                    os.path.join(self.table_output, self.all_cand,
                                 "_".join([prefix, "sORF.csv"])))
                shutil.move(os.path.join(
                    self.gff_output, self.all_cand,
                    "_".join([prefix, "sORF_best.csv"])),
                    os.path.join(self.table_output, self.best,
                                 "_".join([prefix, "sORF.csv"])))

    def _remove_tmp(self, args_sorf):
        # Clean up every temporary artifact created during the run.
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcript and CDS of {0}".format(prefix))
                # Writes <prefix>_inter.gff with the intergenic regions.
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path, "_".join([
                                   prefix, "transcript.gff"])),
                               os.path.join(args_sorf.out_folder, "_".join([
                                   prefix, "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo)
        return prefixs

    def run_sorf_detection(self, args_sorf):
        # Entry point: validate inputs, detect sORFs per genome, then run
        # statistics and clean up.
        if args_sorf.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self._check_necessary_files(args_sorf)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path, None,
                                     "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf)
        self._start_stop_codon(prefixs, args_sorf)
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     os.path.join(args_sorf.out_folder, "statistics",
                                  "_".join(["stat",
                                            sorf.replace(".gff", ".csv")])),
                     args_sorf.utr_detect)
        self._remove_tmp(args_sorf)
class TestMultiparser(unittest.TestCase):
    """Unit tests for Multiparser's split (parser_*) and merge (combine_*)
    helpers, using throwaway ref/tar fixture folders."""

    def setUp(self):
        # Fresh fixture folders for every test.
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if (not os.path.exists(self.ref_folder)):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if (not os.path.exists(self.tar_folder)):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        # Remove fixture folders regardless of test outcome.
        if os.path.exists(self.ref_folder):
            shutil.rmtree(self.ref_folder)
        if os.path.exists(self.tar_folder):
            shutil.rmtree(self.tar_folder)

    def test_combine_fasta(self):
        # Per-strain fastas in tar/tmp should be merged into test.fa,
        # guided by the reference test.gff_folder.
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_tar, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_tar, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_ref, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_ref, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        # Strain-split wigs should be re-merged into one forward and one
        # reverse wig per track listed in libs.
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_wig1 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_aaa.wig")
        sub_wig2 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_bbb.wig")
        sub_wig3 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_aaa.wig")
        sub_wig4 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_bbb.wig")
        wig_files = [sub_wig1, sub_wig2, sub_wig3, sub_wig4]
        example_wigs = [
            self.example.sub_f_wig1, self.example.sub_f_wig2,
            self.example.sub_r_wig1, self.example.sub_r_wig2
        ]
        for index in range(0, 4):
            with open(wig_files[index], "w") as fh:
                fh.write(example_wigs[index])
        # lib format: <wig>:<tex|notex|frag>:<condition>:<replicate>:<strand>
        libs = [
            "test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
            "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"
        ]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self.assertTrue(
            os.path.exists(os.path.join(tmp_tar, "test_forward.wig")))
        self.assertTrue(
            os.path.exists(os.path.join(tmp_tar, "test_reverse.wig")))

    def test_combine_gff(self):
        # Per-strain gffs in tar/tmp should be merged into test.gff,
        # guided by the reference test.fa_folder.
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_tar, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_tar, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        # A multi-record fasta should be split per sequence into both
        # tmp/ and a <name>.fa_folder/ copy.
        fasta_file = os.path.join(self.ref_folder, "test.fa")
        with open(fasta_file, "w") as rh:
            rh.write(self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.fa")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.fa")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.fa_folder/aaa.fa")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.fa_folder/bbb.fa")))

    def test_parser_gff(self):
        # Split by seq-id, first without a feature suffix, then with "TSS".
        gff_file = os.path.join(self.ref_folder, "test.gff")
        with open(gff_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.gff")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.gff_folder/aaa.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.gff_folder/bbb.gff")))
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        # NOTE(review): the rename is immediately superseded by rewriting
        # tss_file below; kept for fidelity with the original test.
        os.rename(gff_file, tss_file)
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        with open(tss_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa_TSS.gff")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb_TSS.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "test_TSS.gff_folder/aaa_TSS.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "test_TSS.gff_folder/bbb_TSS.gff")))

    def test_parser_wig(self):
        # Multi-strain wigs should be split per strain into tmp/ and into
        # per-track <name>.wig_folder/ copies.
        wig_f_file = os.path.join(self.ref_folder, "test_forward.wig")
        with open(wig_f_file, "w") as rh:
            rh.write(self.example.wig_f_file)
        wig_r_file = os.path.join(self.ref_folder, "test_reverse.wig")
        with open(wig_r_file, "w") as rh:
            rh.write(self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_reverse_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_forward.wig_folder/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_forward.wig_folder/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig")))
class sRNADetection(object):
    '''Detection of sRNA candidates from transcripts, TSS/processing sites,
    coverage wiggle files and (optionally) BLAST databases.

    All methods operate on paths derived from ``args_srna.out_folder``; the
    heavy lifting (intergenic/UTR detection, table generation, BLAST parsing)
    is delegated to module-level helper functions.
    '''

    def __init__(self, args_srna):
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        # Output layout under the main output folder.
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        # "tmp" sub-folders of the optional inputs (None when input absent).
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        # Prefixes of the intermediate per-strain files.
        self.prefixs = {
            "merge": os.path.join(args_srna.out_folder, "tmp_merge"),
            "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"),
            "normal": os.path.join(args_srna.out_folder, "tmp_normal"),
            "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"),
            "merge_table": os.path.join(args_srna.out_folder,
                                        "tmp_merge_table"),
            "utr_table": os.path.join(args_srna.out_folder,
                                      "tmp_utrsrna_table"),
            "normal_table": os.path.join(args_srna.out_folder,
                                         "tmp_normal_table"),
            "in_cds_table": os.path.join(args_srna.out_folder,
                                         "tmp_incds_table"),
            "basic": os.path.join(args_srna.out_folder, "tmp_basic"),
            "energy": os.path.join(args_srna.out_folder, "tmp_energy")}
        self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"),
                     "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")}
        self.best_table = os.path.join(self.table_output, "best")
        # NOTE: the original re-assigned table_output/stat_path here with
        # identical values; the redundant duplicates were removed.
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best")}

    def _check_folder_exist(self, folder):
        '''Return the "tmp" sub-path of folder, or None if no folder given.'''
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        '''Validate attribute uniqueness of every .gff file in gffs.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        '''Run makeblastdb to format database into db_file.'''
        call([os.path.join(blast_path, "makeblastdb"), "-in", database,
              "-dbtype", type_, "-out", db_file], stderr=err)

    def _formatdb(self, database, type_, out_folder, blast_path,
                  database_type):
        '''Locate the database fasta and format it for BLAST.

        If database lacks a fasta extension, the matching fasta in its
        folder is used instead. sRNA databases are first normalized via
        change_format.
        '''
        # "with" guarantees the log handle is closed even if a call fails
        # (the original leaked it on error).
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            if (database.endswith(".fa")) or (
                    database.endswith(".fna")) or (
                    database.endswith(".fasta")):
                pass
            else:
                folders = database.split("/")
                filename = folders[-1]
                folder = "/".join(folders[:-1])
                for fasta in os.listdir(folder):
                    if (fasta.endswith(".fa")) or (
                            fasta.endswith(".fna")) or (
                            fasta.endswith(".fasta")):
                        if ".".join(fasta.split(".")[:-1]) == filename:
                            database = os.path.join(folder, fasta)
            if database_type == "sRNA":
                change_format(database, "tmp_srna_database")
                os.remove(database)
                shutil.move("tmp_srna_database", database)
            db_file = ".".join(database.split(".")[:-1])
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        '''merge the results of fragmented and tex treated libs'''
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _read_lib_wig(self, args_srna):
        '''Read libraries and both wiggle strands; returns
        [libs, texs, wigs_f, wigs_r].'''
        libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
        wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
        wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
        return [libs, texs, wigs_f, wigs_r]

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        '''detection of intergenic and antisense sRNA'''
        tex_datas = None
        frag_datas = None
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if self.tss_path is not None:
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(self.pro_path,
                                               "_processing.gff",
                                               prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            frag_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, frag_datas[0], frag_datas[1],
                            frag_datas[2], frag_datas[3])
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            tex_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, tex_datas[0], tex_datas[1],
                            tex_datas[2], tex_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # Prefer re-classified TSS when the classifier produced them.
        if ("TSS_class" in os.listdir(args_srna.out_folder)) and (
                not args_srna.tss_source):
            tss = os.path.join(args_srna.out_folder, "TSS_class",
                               prefix + "_TSS.gff")
        return tss, frag_datas, tex_datas

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna,
                     frag_datas, tex_datas):
        '''detection of UTR-derived sRNA'''
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"),
                "tex", prefix, args_srna)
            utr_derived_srna(args_srna, tex_datas[0], tex_datas[1],
                             tex_datas[2], tex_datas[3])
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"),
                "frag", prefix, args_srna)
            utr_derived_srna(args_srna, frag_datas[0], frag_datas[1],
                             frag_datas[2], frag_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        '''Validate input files and pre-split multi-strain inputs.

        Exits when mandatory inputs (annotation, transcripts, wiggles, and
        for UTR detection TSS/fasta) are missing.
        '''
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        if args_srna.import_info is not None:
            # Fasta sequences are needed by 2D-structure and BLAST filters.
            if args_srna.utr_srna or (
                    "sec_str" in args_srna.import_info) or (
                    args_srna.nr_database is not None) or (
                    args_srna.srna_database is not None):
                if args_srna.fastas is None:
                    print("Error: lack required fasta files for UTR "
                          "derived sRNA detection!!!!")
                    sys.exit()
                self.multiparser.parser_fasta(args_srna.fastas)
                self.multiparser.combine_fasta(args_srna.gffs,
                                               self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _merge_tex_frag_datas(self, tex_datas, frag_datas):
        '''Fold frag coverage dicts (indices 2, 3) into tex_datas;
        if only frag data exists it is returned as-is.'''
        if (tex_datas is not None) and (frag_datas is not None):
            for index in [2, 3]:
                for strain, conds in frag_datas[index].items():
                    if strain not in tex_datas[index].keys():
                        tex_datas[index][strain] = conds
                    else:
                        for cond, tracks in conds.items():
                            tex_datas[index][strain][cond] = tracks
        elif (tex_datas is None) and (frag_datas is not None):
            tex_datas = frag_datas
        return tex_datas

    def _run_program(self, args_srna):
        '''Run intergenic (and optionally UTR) detection for every strain
        annotation; returns the list of strain prefixes.'''
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"], prefix])}
                csvs = {"merge": "_".join(
                            [self.prefixs["merge_table"], prefix]),
                        "utr": "_".join(
                            [self.prefixs["utr_table"], prefix]),
                        "normal": "_".join(
                            [self.prefixs["normal_table"], prefix])}
                tss, frag_datas, tex_datas = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of "
                          "{0}".format(prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff",
                            prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss, pro,
                                          args_srna, frag_datas, tex_datas)
                tex_datas = self._merge_tex_frag_datas(tex_datas, frag_datas)
                # Coverage dicts are large; free them eagerly.
                del frag_datas
                gc.collect()
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff),
                                 tss, tex_datas)
                del tex_datas
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"],
                                               prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss,
                    tex_datas):
        '''Merge normal/UTR candidates into one gff and table.'''
        print("merging data of sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs, tex_datas[2], tex_datas[3],
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        '''Compute secondary structures of seq_file with RNAfold.'''
        os.system(" ".join(["cat", seq_file, "|",
                            os.path.join(vienna_path, "RNAfold"),
                            "-p", ">", sec_file]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        '''extract the sec str energy'''
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
                    fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            seq_file = os.path.join(
                out_folder, "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(
                out_folder, "_".join(["sRNA_2d", prefix]))
            # "fasta" still holds the matching file from the loop above.
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(
            os.path.join(main_path,
                         "_".join([self.prefixs["basic"], prefix])),
            sec_file,
            os.path.join(main_path,
                         "_".join([self.prefixs["energy"], prefix])))
        # "|" in identifiers breaks downstream tools; normalize filenames.
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path,
                "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file,
                    rel_file):
        '''Run relplot.pl to annotate a structure plot with reliability.'''
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp_paths["tmp"], file_),
                            os.path.join(tmp_paths["tmp"], dot_file),
                            ">", os.path.join(tmp_paths["tmp"], rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        '''Convert one PostScript plot to PDF.'''
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_),
              pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path,
                           prefix):
        '''Replot structures, convert them to PDF and sort the PDFs into
        per-strain sec/dot folders.'''
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths, file_,
                                  pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
            ["rss.pdf"])
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
            ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        '''Run mountain.pl on one dot plot, writing to out.'''
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix,
                       vienna_util):
        '''Generate mountain plots for every dot plot when requested.'''
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"],
                                            "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths, dot_file,
                                       out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix,
                                             moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        '''Compute folding energy and structure plots for each strain.'''
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                self.fasta_path, args_srna.out_folder, prefix, sec_path,
                dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            # _get_seq_sec chdir'd into the scratch dir; go back and drop it.
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        '''Run one BLAST search restricted to the given strand.'''
        call([os.path.join(blast_path, program), "-db", database,
              "-evalue", str(e), "-strand", strand, "-query", seq_file,
              "-out", blast_file])

    def _get_strand_fasta(self, seq_file, out_folder):
        '''Split a fasta into plus/minus-strand files based on the trailing
        strand character of each header; returns both paths.'''
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        '''BLAST all candidate sequences against database and fold the hit
        annotations back into the per-strain candidate files.'''
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                    args_srna.out_folder, "blast_result_and_misc",
                    "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0} in {1}".format(prefix,
                                                           database))
                seq_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["sRNA_seq", prefix]))
                # BUGFIX: compare the basename — os.listdir() yields bare
                # names, so the original full-path test never matched and
                # the sequence file was regenerated on every call.
                if os.path.basename(seq_file) not in os.listdir(
                        args_srna.out_folder):
                    self.helper.get_seq(
                        srna_file,
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        seq_file)
                if database_type == "nr":
                    # nr (protein) search must run each strand separately.
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                        seq_file, args_srna.out_folder)
                    tmp_blast = "tmp_blast.txt"
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_plus, tmp_blast,
                                    "plus")
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_minus, blast_file,
                                    "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, seq_file, blast_file,
                                    "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''classify the sRNA based on the filters'''
        if (args_srna.import_info is not None) or (
                args_srna.srna_database is not None) or (
                args_srna.nr_database is not None) or (
                self.sorf_path is not None) or (
                self.tss_path is not None) or (
                self.term_path is not None) or (
                args_srna.promoter_table is not None):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                # Each per-strain folder is (re)created exactly once; the
                # original created the same paths twice.
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                classify_srna(
                    os.path.join(self.all_best["all_gff"],
                                 "_".join([prefix, "sRNA.gff"])),
                    class_gff, out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table, self.term_path)

    def _get_best_result(self, prefixs, args_srna):
        '''get the best results based on the filters'''
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                best_gff, args_srna)
            gen_srna_table(
                os.path.join(self.all_best["best_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]),
                args_srna, best_table, self.term_path)

    def _remove_file(self, args_srna):
        '''Remove all intermediate tmp files and folders.'''
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_",
                                       "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        self.helper.remove_tmp(self.gff_output)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''set the filter of sRNA'''
        if args_srna.import_info is not None:
            if "sec_str" in args_srna.import_info:
                self._compute_2d_and_energy(args_srna, prefixs)
        if args_srna.nr_database is not None:
            self._blast(args_srna.nr_database, args_srna.nr_format,
                        "prot", args_srna, prefixs, "blastx", "nr",
                        args_srna.e_nr)
        if self.sorf_path is not None:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))
        if args_srna.srna_database is not None:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)

    def _import_info_format(self, import_info):
        '''Lower-case every entry of import_info.'''
        return [info.lower() for info in import_info]

    def _gen_table(self, prefixs, args_srna):
        '''Generate the all-candidates table for each strain.'''
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]),
                args_srna, out_table, self.term_path)

    def _print_rank_all(self, prefixs):
        '''Rank all candidates against the best set for each strain.'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        '''filter out the low expressed UTR-derived sRNA'''
        for prefix in prefixs:
            filter_utr(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.all_best["all_table"],
                             "_".join([prefix, "sRNA.csv"])),
                min_utr)

    def _antisense(self, gffs, prefixs):
        '''detection of antisense'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        '''do statistics for blast result'''
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path,
                "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''compare sRNA with terminator and promoter'''
        if self.term_path is not None:
            compare_srna_term(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                out_table,
                os.path.join(self.term_path,
                             "_".join([prefix, "term.gff"])),
                args_srna.fuzzy_b, args_srna.fuzzy_a)
        if (args_srna.promoter_table is not None):
            compare_srna_promoter(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                out_table, args_srna)

    def run_srna_detection(self, args_srna):
        '''Entry point: detect, filter, classify and rank sRNA candidates.'''
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None,
                                     "transcript")
        if args_srna.import_info is not None:
            args_srna.import_info = self._import_info_format(
                args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile(
                "_".join([self.prefixs["basic"], prefix]),
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]),
                prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if args_srna.srna_database is not None:
            if "blast_srna" in args_srna.import_info:
                self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class TestMultiparser(unittest.TestCase):
    """Tests for Multiparser: splitting and re-combining fasta/gff/wig
    files across strains."""

    def setUp(self):
        # Fresh parser, fixture data and two scratch folders per test.
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        self.tar_folder = "tar_folder"
        for folder in (self.ref_folder, self.tar_folder):
            if not os.path.exists(folder):
                os.mkdir(folder)

    def tearDown(self):
        # Remove the scratch folders and whatever the test wrote into them.
        for folder in (self.ref_folder, self.tar_folder):
            if os.path.exists(folder):
                shutil.rmtree(folder)

    def _dump(self, path, content):
        # Write one fixture file.
        with open(path, "w") as out:
            out.write(content)

    def test_combine_fasta(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        self._dump(os.path.join(tmp_tar, "aaa.fa"), self.example.sub_fasta1)
        self._dump(os.path.join(tmp_tar, "bbb.fa"), self.example.sub_fasta2)
        self._dump(os.path.join(tmp_ref, "aaa.gff"), self.example.sub_gff1)
        self._dump(os.path.join(tmp_ref, "bbb.gff"), self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        self._dump(os.path.join(tmp_ref, "aaa.fa"), self.example.sub_fasta1)
        self._dump(os.path.join(tmp_ref, "bbb.fa"), self.example.sub_fasta2)
        wig_names = ["test_forward.wig_STRAIN_aaa.wig",
                     "test_forward.wig_STRAIN_bbb.wig",
                     "test_reverse.wig_STRAIN_aaa.wig",
                     "test_reverse.wig_STRAIN_bbb.wig"]
        wig_contents = [self.example.sub_f_wig1, self.example.sub_f_wig2,
                        self.example.sub_r_wig1, self.example.sub_r_wig2]
        for name, content in zip(wig_names, wig_contents):
            self._dump(os.path.join(tmp_tar, name), content)
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
                "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta",
                                     libs)
        self.assertTrue(os.path.exists(
            os.path.join(tmp_tar, "test_forward.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(tmp_tar, "test_reverse.wig")))

    def test_combine_gff(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        self._dump(os.path.join(tmp_ref, "aaa.fa"), self.example.sub_fasta1)
        self._dump(os.path.join(tmp_ref, "bbb.fa"), self.example.sub_fasta2)
        self._dump(os.path.join(tmp_tar, "aaa.gff"), self.example.sub_gff1)
        self._dump(os.path.join(tmp_tar, "bbb.gff"), self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta",
                                     None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        self._dump(os.path.join(self.ref_folder, "test.fa"),
                   self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        for expect in ("tmp/aaa.fa", "tmp/bbb.fa",
                       "test.fa_folder/aaa.fa", "test.fa_folder/bbb.fa"):
            self.assertTrue(os.path.exists(
                os.path.join(self.ref_folder, expect)))

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        self._dump(gff_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        for expect in ("tmp/aaa.gff", "tmp/bbb.gff",
                       "test.gff_folder/aaa.gff",
                       "test.gff_folder/bbb.gff"):
            self.assertTrue(os.path.exists(
                os.path.join(self.ref_folder, expect)))
        # Same input again, this time tagged as TSS data.
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        self._dump(tss_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        for expect in ("tmp/aaa_TSS.gff", "tmp/bbb_TSS.gff",
                       "test_TSS.gff_folder/aaa_TSS.gff",
                       "test_TSS.gff_folder/bbb_TSS.gff"):
            self.assertTrue(os.path.exists(
                os.path.join(self.ref_folder, expect)))

    def test_parser_wig(self):
        self._dump(os.path.join(self.ref_folder, "test_forward.wig"),
                   self.example.wig_f_file)
        self._dump(os.path.join(self.ref_folder, "test_reverse.wig"),
                   self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        for expect in (
                "tmp/test_forward_STRAIN_aaa.wig",
                "tmp/test_forward_STRAIN_bbb.wig",
                "tmp/test_reverse_STRAIN_aaa.wig",
                "tmp/test_reverse_STRAIN_bbb.wig",
                "test_forward.wig_folder/test_forward_STRAIN_aaa.wig",
                "test_forward.wig_folder/test_forward_STRAIN_bbb.wig",
                "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig",
                "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig"):
            self.assertTrue(os.path.exists(
                os.path.join(self.ref_folder, expect)))
class TSSpredator(object):
    """Wrapper for running TSSpredator to detect TSSs/processing sites.

    Builds TSSpredator configuration files from the wiggle libraries, runs
    the predictor per genome, converts the resulting MasterTables to GFF,
    and post-processes the predictions (manual-TSS merging, TSS/PS overlap
    filtering, low-expression filtering, validation against annotations and
    transcripts, statistics).
    """

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Folder that collects the per-genome TSSpredator MasterTables.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names/paths of temporary files and folders used while merging and
        # comparing predictions.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # NOTE(review): manual_path is only set when manual TSSs are given;
        # methods using it must only run in that case.
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Pack one colon-separated library description into a dict."""
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write per-condition wiggle assignments (e.g. fivePrimePlus_1a = x)
        to the config file; replicates without a file get an empty entry."""
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): "cond" below refers to the last library of this
            # condition; if cond_list were empty this would raise NameError —
            # assumes every condition has at least one library. TODO confirm.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix,
                      log):
        """Invoke TSSpredator (java -jar) for one genome, capturing its
        stdout/stderr into log.txt/err.txt inside the MasterTable folder."""
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        """Parse the library strings, map them to the per-strain wiggle
        files, and write the annotation/five-prime entries to the config.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id).
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # Replace the generic wiggle name with the strain-specific split
            # file produced by the multiparser ("<name>_STRAIN_<strain>.wig").
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # Classify by (tex treatment, strand): f=tex, n=notex; p/m=strand.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # TSS detection uses the tex-treated libraries as the "five prime"
        # signal; processing-site detection uses the untreated ones.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            # Fixed typo in the original message ("assing" -> "assign").
            print("Error: Wrong program name! Please assign tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # "all_N" applies one minNumRepMatches to every library; otherwise
        # the most common value becomes the default and deviating libraries
        # get their own minNumRepMatches_<lib> entry.
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file, log):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" (background) libraries are the opposite set of the
        # five-prime ones chosen in _import_lib.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        """Convert each genome's MasterTable.tsv into a GFF file in the
        output gffs folder."""
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                    out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                    out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                # Fixed missing space in the original log message.
                log.write("\t" + out_file + " is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation file matching this genome prefix.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                # Fixed typo in the original message ("classiflying").
                print("Merging and classifying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # The annotation file is replaced by the updated version that
            # carries the validation results.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        """Generate classification/library statistics and venn plots for
        each genome's predictions."""
        # Fixed typo in the original message ("statistaics").
        print("Running statistics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # Plot files are emitted into the working directory and then
            # relocated into the per-genome statistics folder.
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        """Generate one TSSpredator config per genome that has a fasta, a
        gff and at least one matching wiggle file; return the prefixes."""
        prefixs = []
        detect = False
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        # Reset the flag so a genome without a matching
                        # wiggle file is not carried over from a previous
                        # iteration (stale-flag bug in the original).
                        detect = False
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config,
                            log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate all matching wiggle files of one genome into
        tmp/merge_forward.wig and tmp/merge_reverse.wig."""
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            # Replace the prediction with the re-classified version.
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Clean up the temporary folders and merged wiggle files."""
        # Fixed typos in the original message ("Remove temperary").
        print("Removing temporary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            # Compare each prediction against its counterpart feature file
            # (TSS vs processing site) and filter overlapping entries.
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): "out" refers to the last opened stats file; if no
        # prediction file matched, this raises NameError — assumes at least
        # one match. TODO confirm.
        out.close()

    def run_tsspredator(self, args_tss, log):
        """Entry point: run the whole TSS/PS prediction pipeline."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None,
                                         args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class sORFDetection(object):
    '''detection of sORF'''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # TSS/sRNA inputs are optional; the corresponding paths stay None
        # when they are not provided.
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        # Sub-folder names for the two candidate sets.
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        """Verify the uniqueness of the attributes of every GFF file."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        """Abort unless annotation, transcripts and wiggles are given
        (plus TSSs for UTR-derived detection); then parse the inputs."""
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
                (args_sorf.tex_wigs is None) and (
                args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(
                os.path.join(self.fasta_path, prefix + ".fa"), srna_file,
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])),
                tss_file,
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "forward.wig"])),
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "reverse.wig"])),
                os.path.join(self.gff_output, self.all_cand,
                             "_".join([prefix, "sORF"])), args_sorf)
            # sorf_detection emits "<prefix>_sORF_all/best" files; rename
            # them into the all_candidates/best_candidates layout.
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_all.gff"])),
                    gff_all)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_best.gff"])),
                    gff_best)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_all.csv"])),
                    csv_all)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_best.csv"])),
                    csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        """Clean up the temporary files and folders."""
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(
                    prefix))
                get_intergenic(
                    os.path.join(args_sorf.gffs, gff),
                    os.path.join(self.tran_path,
                                 "_".join([prefix, "transcript.gff"])),
                    os.path.join(args_sorf.out_folder,
                                 "_".join([prefix, "inter.gff"])),
                    args_sorf.utr_detect, args_sorf.hypo,
                    args_sorf.extend_5, args_sorf.extend_3)
                # Fixed grammar in the original log message
                # ("to temporary store" -> "to temporarily store").
                log.write("\t" + os.path.join(
                    args_sorf.out_folder, "_".join([prefix, "inter.gff"])) +
                    " is generated to temporarily store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        """Rewrite the candidate tables with coverage details per track."""
        log.write("Running re_table.py for generating coverage "
                  "information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_,
                                          "_".join([prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        """Entry point: run the whole sORF detection pipeline."""
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path,
                                       None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            if sorf.endswith("_sORF.gff"):
                # Print only for the files that are actually processed
                # (the original printed for every file in the folder).
                print("Running statistics of {0}".format(sorf))
                stat_file = os.path.join(
                    args_sorf.out_folder, "statistics",
                    "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     stat_file, args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)