class TestHelper(unittest.TestCase):
    """Tests for ``Helper`` that operate on files in a scratch folder.

    Every test starts with ``test_folder/test.gff`` and
    ``test_folder/test.fa`` populated from ``ExampleData``; the folder is
    deleted again after each test.
    """

    def setUp(self):
        """Create the scratch folder and write the example GFF/FASTA."""
        self.example = ExampleData()
        self.helper = Helper()
        self.gff_out = self.example.gff_out
        self.rev_seq = self.example.rev_seq.replace("\n", "")
        self.test_folder = "test_folder"
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)
        self.gff_file = os.path.join(self.test_folder, "test.gff")
        self.seq_file = os.path.join(self.test_folder, "test.fa")
        fixtures = ((self.gff_file, self.example.gff_file),
                    (self.seq_file, self.example.seq))
        for path, content in fixtures:
            with open(path, "w") as handle:
                handle.write(content)

    def tearDown(self):
        """Remove the scratch folder together with everything in it."""
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_remove_all_content(self):
        """"file" mode removes matching files only, "dir" mode folders."""
        file_path = os.path.join(self.test_folder, "tmp1.gff")
        dir_path = os.path.join(self.test_folder, "tmp2")
        shutil.copyfile(self.gff_file, file_path)
        os.mkdir(dir_path)
        self.helper.remove_all_content(self.test_folder, "tmp", "file")
        self.assertFalse(os.path.exists(file_path))
        self.assertTrue(os.path.exists(dir_path))
        self.helper.remove_all_content(self.test_folder, "tmp", "dir")
        self.assertFalse(os.path.exists(dir_path))
        # Entries that do not contain "tmp" must survive both passes.
        self.assertTrue(os.path.exists(self.gff_file))

    def test_remove_tmp(self):
        """``remove_tmp`` deletes both the "tmp" and "*_folder" folders."""
        doomed = [os.path.join(self.test_folder, "tmp"),
                  os.path.join(self.test_folder, "test.gff_folder")]
        for folder in doomed:
            os.mkdir(folder)
        self.helper.remove_tmp(self.test_folder)
        for folder in doomed:
            self.assertFalse(os.path.exists(folder))

    def test_get_correct_file(self):
        """The GFF is picked even when similarly named wig files exist."""
        expected = os.path.join(self.test_folder, "test.gff")
        forward_wig = os.path.join(self.test_folder,
                                   "test_forward.wig_STRAIN_aaa.wig")
        reverse_wig = os.path.join(self.test_folder,
                                   "test_reverse.wig_STRAIN_aaa.wig")
        shutil.copyfile(expected, forward_wig)
        shutil.copyfile(expected, reverse_wig)
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
                "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        found = self.helper.get_correct_file(
            self.test_folder, ".gff", "test", None, libs)
        self.assertEqual(found, expected)

    def test_sorf_gff(self):
        """``sort_gff`` output matches the expected example lines."""
        sorted_path = os.path.join(self.test_folder, "test.out")
        self.helper.sort_gff(self.gff_file, sorted_path)
        produced = import_data(sorted_path)
        self.assertEqual(set(produced), set(self.gff_out.split("\n")))

    def test_extract_gene(self):
        """Forward extraction is a plain slice; reverse uses ``rev_seq``."""
        genome = self.example.seq.replace("\n", "")
        forward = self.helper.extract_gene(genome, 1, 70, "+")
        self.assertEqual(forward, "CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT")
        reverse = self.helper.extract_gene(genome, 1, 140, "-")
        self.assertEqual(reverse, self.rev_seq)

    def test_get_seq(self):
        """``get_seq`` writes one FASTA record for a single-feature GFF."""
        gff_file = os.path.join(self.test_folder, "test.gff")
        cds_file = os.path.join(self.test_folder, "test.cds")
        gff_lines = self.example.gff_out.split("\n")
        # Overwrite the fixture GFF so it holds only one feature line.
        with open(gff_file, "w") as handle:
            handle.write(gff_lines[1])
        self.helper.get_seq(self.gff_file, self.seq_file, cds_file)
        produced = import_data(cds_file)
        self.assertEqual(set(produced), {">cds0|aaa|1|10|+", "CGCAGGTTGA"})
class Terminator(object):
    """Drive the terminator-detection workflow for a set of genomes.

    ``run_terminator`` is the entry point.  It converts the GFF annotations
    to ptt/rnt tables, runs TransTermHP on them, extracts and folds
    intergenic sequences, passes the candidates to ``poly_t`` and
    ``detect_coverage`` and finally writes GFF/CSV results (and optional
    statistics) below ``args_term.out_folder``.
    """

    def __init__(self, args_term):
        """Derive all working paths from ``args_term`` and create outputs.

        ``args_term`` must provide ``gffs``, ``fastas``, ``trans``,
        ``out_folder`` and ``srnas`` (the latter may be empty/None).
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # The GFF and CSV output trees share the same four sub-folders.
        subfolders = (("all", "all_candidates"), ("express", "express"),
                      ("best", "best"), ("non", "non_express"))
        self.terms = {key: os.path.join(self.outfolder["term"], sub)
                      for key, sub in subfolders}
        self.csvs = {key: os.path.join(self.outfolder["csv"], sub)
                     for key, sub in subfolders}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Scratch locations; several live in the current working directory.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate the data rows of ``files`` into ``combine_file``.

        For every input file only the lines *after* the first line that
        contains "Location" are copied.  If the last line of an input file
        has no trailing newline, one is appended so that the next file
        starts on a fresh line.  Empty input files contribute nothing.
        """
        with open(combine_file, "w") as result:
            for file_ in files:
                check_start = False
                # Tracked per file: reading a loop variable after the loop
                # is undefined for an empty file and stale otherwise.
                last_line = None
                with open(file_, "r") as fh:
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                        last_line = line
                if (last_line is not None) and ("\n" not in last_line):
                    result.write("\n")

    def _make_gff_folder(self):
        """(Re)create every GFF and CSV output sub-folder via the helper."""
        for key in ("all", "best", "express", "non"):
            self.helper.check_make_folder(self.terms[key])
            self.helper.check_make_folder(self.csvs[key])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Convert every ``*.gff`` in ``gff_path`` to ptt/rnt tables.

        Returns ``(file_types, prefixs)``: ``file_types`` maps each prefix
        to "srna" (an sRNA GFF was found and converted as well) or
        "normal"; ``prefixs`` lists the prefixes in directory order.
        Exits the program when no matching fasta file is found.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if not gff.endswith(".gff"):
                continue
            prefix = gff.split("/")[-1][:-4]
            prefixs.append(prefix)
            gff_file = os.path.join(gff_path, gff)
            rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
            ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
            fasta = self.helper.get_correct_file(
                fasta_path, ".fa", prefix, None, None)
            if not fasta:
                print("Error: no proper file - {0}.fa".format(prefix))
                sys.exit()
            srna = None
            if sRNAs:
                self.multiparser.parser_gff(sRNAs, "sRNA")
                srna = self.helper.get_correct_file(
                    self.srna_path, "_sRNA.gff", prefix, None, None)
            # ``fasta`` is guaranteed to be set here, so only the presence
            # of an sRNA file decides which conversion is run.
            if srna:
                self.converter.convert_gff2rntptt(
                    gff_file, fasta, ptt_file, rnt_file, srna,
                    srna.replace(".gff", ".rnt"))
                file_types[prefix] = "srna"
            else:
                self.converter.convert_gff2rntptt(
                    gff_file, fasta, ptt_file, rnt_file, None, None)
                file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Merge ptt, rnt (and sRNA rnt) tables into one ptt per prefix."""
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type not in ("normal", "srna"):
                continue
            files = [os.path.join(gff_path, prefix + ".ptt"),
                     os.path.join(gff_path, prefix + ".rnt")]
            if file_type == "srna":
                files.append(os.path.join(
                    srna_path, "_".join([prefix, "sRNA.rnt"])))
            self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke TransTermHP for one genome, sending stdout to ``out``."""
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP on every combined ptt file, then drop them.

        Results go to ``args_term.hp_folder/<prefix>``.  Exits the program
        when the fasta file of a prefix cannot be found.
        """
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" not in file_:
                continue
            prefix = file_.replace(".ptt", "")
            fasta = self.helper.get_correct_file(
                self.fasta_path, ".fa", prefix, None, None)
            if not fasta:
                print("Error: no proper file - {0}.fa".format(prefix))
                sys.exit()
            out_path = os.path.join(args_term.hp_folder, prefix)
            self.helper.check_make_folder(out_path)
            log_file = os.path.join(
                out_path, "_".join([prefix, "terminators.txt"]))
            with open(log_file, "w") as out:
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term)
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert the TransTermHP ``.bag`` results of each prefix to GFF."""
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix != folder:
                    continue
                out_path = os.path.join(args_term.hp_folder, folder)
                for file_ in os.listdir(out_path):
                    if file_.endswith(".bag"):
                        out_file = os.path.join(
                            self.tmps["transterm"],
                            "_".join([prefix, self.tmps["hp_gff"]]))
                        self.converter.convert_transtermhp2gff(
                            os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Return the folder holding the wig files to use.

        When both TEX and fragmented wigs are given, their files (not
        sub-folders) are copied into a ``merge_wigs`` folder next to the
        TEX folder.  Otherwise the single given folder is returned.  Exits
        the program when neither is given.
        """
        tex_wigs = args_term.tex_wigs
        frag_wigs = args_term.frag_wigs
        if (tex_wigs is not None) and (frag_wigs is not None):
            folder = "/".join(tex_wigs.split("/")[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig_folder in (tex_wigs, frag_wigs):
                for wig in os.listdir(wig_folder):
                    wig_file = os.path.join(wig_folder, wig)
                    if not os.path.isdir(wig_file):
                        shutil.copy(wig_file, merge_wigs)
        elif tex_wigs is not None:
            merge_wigs = tex_wigs
        elif frag_wigs is not None:
            merge_wigs = frag_wigs
        else:
            print("Error: no proper wig files!!!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        """Merge annotation and sRNA GFFs; return the folder to read from.

        Without sRNAs the original ``gff_path`` is returned unchanged.
        """
        if sRNAs is None:
            return gff_path
        self.multiparser.parser_gff(sRNAs, "sRNA")
        self.helper.check_make_folder(self.tmps["merge"])
        tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
        for prefix in prefixs:
            # Start every prefix from a clean scratch file.
            if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                os.remove(tmp_gff)
            self.helper.merge_file(
                os.path.join(gff_path, prefix + ".gff"), tmp_gff)
            self.helper.merge_file(
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])), tmp_gff)
            self.helper.sort_gff(
                tmp_gff, os.path.join(self.tmps["merge"], prefix + ".gff"))
            os.remove(tmp_gff)
        return self.tmps["merge"]

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort each ``*_term.gff`` and build the "all candidates" outputs.

        For every prefix the sorted GFF is moved to the "all" GFF folder
        (with a GFF3 header) and a CSV is assembled from the per-strain raw
        tables.  ``csv_outfolder`` is accepted for interface compatibility
        but is not used.
        """
        for gff in os.listdir(term_outfolder):
            if not gff.endswith("_term.gff"):
                continue
            term_gff = os.path.join(term_outfolder, gff)
            self.helper.sort_gff(term_gff, self.tmps["gff"])
            shutil.move(self.tmps["gff"], term_gff)
            prefix = gff.replace("_term.gff", "")
            csv_name = "_".join([prefix, self.suffixs["csv"]])
            new_gff = os.path.join(self.terms["all"], "_".join([
                prefix, self.suffixs["allgff"]]))
            csv_file = os.path.join(self.csvs["all"], csv_name)
            with open(new_gff, "w") as out:
                out.write("##gff-version 3\n")
            self.helper.merge_file(term_gff, new_gff)
            os.remove(term_gff)
            if csv_name in os.listdir(self.csvs["all"]):
                os.remove(csv_file)
            with open(csv_file, "w") as out_csv:
                out_csv.write("\t".join([
                    "strain", "name", "start", "end", "strand",
                    "detect", "coverage_detail"]) + "\n")
            # Append each strain's raw table once, whenever the strain
            # changes while walking the GFF entries.
            pre_strain = ""
            with open(new_gff) as fh:
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(
                            os.path.join(self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            csv_file)
                    pre_strain = entry.seq_id

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        """Fold ``tmp_seq`` with RNAfold, writing the result to ``tmp_sec``.

        RNAfold is started from inside a scratch folder which is removed
        afterwards.
        """
        print("Computing secondray structure of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        # NOTE(review): the input/output paths are resolved one level up
        # from the scratch folder, i.e. they are presumably expected to be
        # relative to the directory the scratch folder lives in - confirm.
        # The command goes through the shell, so paths containing spaces
        # or shell metacharacters will break it.
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                            ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        """Detect terminators for every prefix and collect the results.

        Per prefix: extract intergenic sequences, fold them, call
        ``poly_t`` and ``detect_coverage``.  Afterwards the per-prefix GFFs
        are combined and handed to ``_move_file``.
        """
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            fasta_file = os.path.join(self.fasta_path, prefix + ".fa")
            print("Extracting seq of {0}".format(prefix))
            intergenic_seq(fasta_file, tran_file, gff_file, tmp_seq)
            self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                              prefix)
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                   args_term)
            print("detection of terminator")
            detect_coverage(
                tmp_cand, gff_file, tran_file, fasta_file,
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])),
                merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])),
                args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete all temporary folders and intermediate files."""
        self.helper.remove_tmp(args_term.gffs)
        self.helper.remove_tmp(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            # The merge folder only exists when sRNAs were merged in.
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp(args_term.trans)
        self.helper.remove_tmp(args_term.tex_wigs)
        self.helper.remove_tmp(args_term.frag_wigs)
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        for file_prefix in ("inter_seq_", "inter_sec_", "term_candidates_"):
            self.helper.remove_all_content(args_term.out_folder,
                                           file_prefix, "file")

    def _compute_stat(self, args_term):
        """Renumber the terminators and, if requested, compute statistics.

        Every ``*_term_all.gff`` is rewritten as ``*_term.gff`` with
        sequential ``ID``/``Name`` attributes.  With ``args_term.stat`` set,
        ``stat_term`` is run per prefix, its CSV outputs are moved to the
        table folders and the ``*_term_all.gff`` file is removed.
        """
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if not gff.endswith("_term_all.gff"):
                continue
            new_prefix = gff.replace("_term_all.gff", "")
            new_prefixs.append(new_prefix)
            with open(self.tmps["gff"], "w") as out_tmp, \
                    open(os.path.join(self.terms["all"], gff)) as fh:
                out_tmp.write("##gff-version 3\n")
                for num, entry in enumerate(self.gff_parser.entries(fh)):
                    entry.attributes["ID"] = "term" + str(num)
                    entry.attributes["Name"] = "Terminator_%05d" % num
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([
                        entry.info_without_attributes,
                        entry.attribute_string]) + "\n")
            shutil.move(self.tmps["gff"], os.path.join(
                self.terms["all"],
                "_".join([new_prefix, self.suffixs["gff"]])))
        if args_term.stat:
            stat_path = os.path.join(args_term.out_folder, "statistics")
            for prefix in new_prefixs:
                csv_name = "_".join([prefix, self.suffixs["csv"]])
                term_name = "_".join([prefix, "term"])
                stat_term(
                    os.path.join(self.terms["all"],
                                 "_".join([prefix, self.suffixs["gff"]])),
                    os.path.join(self.csvs["all"], csv_name),
                    os.path.join(stat_path,
                                 "_".join(["stat", prefix + ".csv"])),
                    os.path.join(self.terms["best"], term_name),
                    os.path.join(self.terms["express"], term_name),
                    os.path.join(self.terms["non"], term_name))
                # stat_term leaves its CSVs next to the GFFs; move them to
                # the matching table folders.
                for key in ("best", "express", "non"):
                    shutil.move(os.path.join(self.terms[key], csv_name),
                                os.path.join(self.csvs[key], csv_name))
                os.remove(os.path.join(
                    self.terms["all"],
                    "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        """Run the helper's attribute check on every ``*.gff`` in folder."""
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term):
        """Compare terminators with transcripts for each result class.

        The statistics file written by ``compare_term_tran`` is renamed
        after every run so the three result classes do not overwrite each
        other.
        """
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        stat_folder = os.path.join(args_term.out_folder, "statistics")
        for type_ in ("best", "express", "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator")
            shutil.move(
                os.path.join(stat_folder,
                             "stat_comparison_terminator_transcript.csv"),
                os.path.join(stat_folder,
                             "stat_comparison_terminator_transcript_" +
                             type_ + ".csv"))

    def run_terminator(self, args_term):
        """Run the complete terminator-detection workflow.

        Exits the program when the GFF or fasta folder is not assigned.
        """
        # Validate before the folders are touched: os.listdir(None) would
        # silently scan the current directory instead of failing.
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: please assign gff annotation folder "
                  "and fasta folder!!!")
            sys.exit()
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class RATT(object):
    """Drive an annotation transfer with RATT and convert its results.

    ``annotation_transfer`` is the entry point: it splits the GenBank
    references into per-record files, converts them to EMBL, runs RATT for
    every reference/target pair and, if requested, converts the resulting
    EMBL files to GFF/ptt/rnt.
    """

    def __init__(self, args_ratt):
        """Derive all working paths from ``args_ratt``."""
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        # Scratch file that receives the record currently being split off.
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Create ptt/rnt tables next to every GFF listed in ``files``.

        GFFs without a matching target fasta file are skipped.
        """
        for gff in files:
            if not gff.endswith(".gff"):
                continue
            gff = os.path.join(gffs, gff)
            prefix = gff.split("/")[-1][:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Replace per-sequence outputs by the merged ones and clean up."""
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(args_ratt.gff_outfolder,
                                           suffix, "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT EMBL result to GFF and record it in ``files``.

        The GFF name is the dot-separated middle part of ``ratt_result``
        (first field and last two fields dropped).  The file is written to
        ``output_path`` and copied to ``gff_outfolder``.
        """
        name = ratt_result.split(".")
        strain = ".".join(name[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    @staticmethod
    def _first_field_after(line, keyword):
        """Return the first whitespace-separated field that is not keyword.

        Returns None when the line holds no such field.
        """
        return next((field for field in line.split() if field != keyword),
                    None)

    def _parser_embl_gbk(self, files):
        """Split GenBank files into one ``<name>.gbk`` file per record.

        A record starts at a ``LOCUS`` line and ends at a ``//`` line.  It
        is named after the ``LOCUS`` field, or after the ``VERSION`` field
        when one is present (see the note below).  Lines outside a record
        are ignored.  Returns the folder that holds the split files.
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            out = None          # handle of the record being written
            filename = None     # target name of the record being written
            with open(file_, "r") as f_h:
                for line in f_h:
                    if line.startswith("LOCUS"):
                        if out is not None:
                            # Previous record had no terminator line.
                            out.close()
                        out = open(self.gbk_tmp, "w")
                        # split() without argument also strips the line
                        # break, which would otherwise end up in the
                        # file name of single-field lines.
                        name = self._first_field_after(line, "LOCUS")
                        if name is not None:
                            filename = ".".join([name, "gbk"])
                    elif line.startswith("VERSION"):
                        name = self._first_field_after(line, "VERSION")
                        if name is not None:
                            new_filename = ".".join([name, "gbk"])
                            # NOTE(review): find() is 0 (falsy) only when
                            # new_filename starts with filename, so the
                            # VERSION name wins in every other case; a
                            # substring test may have been intended -
                            # confirm before changing.
                            if (filename is None) or (
                                    new_filename.find(filename)):
                                filename = new_filename
                    if out is not None:
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            # Forget the handle so that trailing lines are
                            # not written to a closed file.
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
            if out is not None:
                # Unterminated last record: close it; as before, the
                # scratch file is left in place and not renamed.
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        """Split the ``*.gbk`` files of ``ref_embls`` and convert to EMBL.

        Returns the folder with the split GenBank files.  Exits the program
        when the folder contains no ``*.gbk`` file.
        """
        gbks = [os.path.join(ref_embls, embl)
                for embl in os.listdir(ref_embls)
                if embl.endswith(".gbk")]
        if not gbks:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Call RATT for one target/reference pair; stdout goes to out."""
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every "ref:tar" pair and tidy the working folder.

        Files whose name contains "final" are moved to ``output_path``;
        other RATT by-products in the current directory are deleted.  The
        log file is rewritten for every pair.
        """
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            print(tar)
            # The context manager closes the log after every run, also
            # when the call raises.
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if "final" in filename:
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt):
        """Run the whole transfer and, if requested, the GFF conversion.

        With ``args_ratt.convert`` set, every "final.embl" result is
        converted to GFF/ptt/rnt, the per-sequence files are merged per
        target fasta folder and all temporary data is removed.
        """
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                if "_folder" not in folder:
                    continue
                # "<prefix>.fa_folder" -> "<prefix>"
                prefix = folder.split("_folder")[0][:-3]
                files = [file_[:-3] for file_ in os.listdir(
                    os.path.join(args_ratt.tar_fastas, folder))]
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if file_ != gff[:-4]:
                            continue
                        for ext in ("gff", "ptt", "rnt"):
                            if ("." + ext) in gff:
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff),
                                    self.tmp_files[ext])
                for ext in ("gff", "ptt", "rnt"):
                    shutil.move(self.tmp_files[ext], os.path.join(
                        self.tmp_files["out_gff"], prefix + "." + ext))
            self._remove_files(args_ratt, out_gbk)
class sRNADetection(object): def __init__(self, args_srna): self.args_container = ArgsContainer() self.helper = Helper() self.multiparser = Multiparser() self.gff_output = os.path.join(args_srna.out_folder, "gffs") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.tss_path = self._check_folder_exist(args_srna.tss_folder) self.pro_path = self._check_folder_exist(args_srna.pro_folder) self.sorf_path = self._check_folder_exist(args_srna.sorf_file) self.fasta_path = os.path.join(args_srna.fastas, "tmp") self.tran_path = os.path.join(args_srna.trans, "tmp") self.term_path = self._check_folder_exist(args_srna.terms) self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs") self.prefixs = {"merge": os.path.join( args_srna.out_folder, "tmp_merge"), "utr": os.path.join( args_srna.out_folder, "tmp_utrsrna"), "normal": os.path.join( args_srna.out_folder, "tmp_normal"), "in_cds": os.path.join( args_srna.out_folder, "tmp_incds"), "merge_table": os.path.join( args_srna.out_folder, "tmp_merge_table"), "utr_table": os.path.join( args_srna.out_folder, "tmp_utrsrna_table"), "normal_table": os.path.join( args_srna.out_folder, "tmp_normal_table"), "in_cds_table": os.path.join( args_srna.out_folder, "tmp_incds_table"), "basic": os.path.join( args_srna.out_folder, "tmp_basic"), "energy": os.path.join( args_srna.out_folder, "tmp_energy")} self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"), "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")} self.best_table = os.path.join(self.table_output, "best") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.all_best = {"all_gff": os.path.join( self.gff_output, "all_candidates"), "best_gff": os.path.join(self.gff_output, "best"), "all_table": os.path.join( self.table_output, "all_candidates"), "best_table": os.path.join(self.table_output, "best")} def 
_check_folder_exist(self, folder): if folder is not None: path = os.path.join(folder, "tmp") else: path = None return path def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_format(self, blast_path, database, type_, db_file, err): call([os.path.join(blast_path, "makeblastdb"), "-in", database, "-dbtype", type_, "-out", db_file], stderr=err) def _formatdb(self, database, type_, out_folder, blast_path, database_type): err = open(os.path.join(out_folder, "log.txt"), "w") if (database.endswith(".fa")) or ( database.endswith(".fna")) or ( database.endswith(".fasta")): pass else: folders = database.split("/") filename = folders[-1] folder = "/".join(folders[:-1]) for fasta in os.listdir(folder): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): if ".".join(fasta.split(".")[:-1]) == filename: database = os.path.join(folder, fasta) if database_type == "sRNA": change_format(database, "tmp_srna_database") os.remove(database) shutil.move("tmp_srna_database", database) db_file = ".".join(database.split(".")[:-1]) self._run_format(blast_path, database, type_, db_file, err) err.close() def _merge_frag_tex_file(self, files, args_srna): if (args_srna.frag_wigs is not None) and ( args_srna.tex_wigs is not None): self.helper.merge_file(files["frag_gff"], files["tex_gff"]) self.helper.merge_file(files["frag_csv"], files["tex_csv"]) shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) os.remove(files["frag_csv"]) os.remove(files["frag_gff"]) os.remove(files["tex_gff"]) elif (args_srna.frag_wigs is not None): shutil.move(files["frag_csv"], files["merge_csv"]) self.helper.sort_gff(files["frag_gff"], files["merge_gff"]) os.remove(files["frag_gff"]) elif (args_srna.tex_wigs is not None): shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) def 
_run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna): if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter")) files = {"frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None} if ("tss" in args_srna.import_info): tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff", prefix, None, None) else: tss = None if self.pro_path is not None: pro = self.helper.get_correct_file( self.pro_path, "_processing.gff", prefix, None, None) else: pro = None if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag_table", prefix])) args_srna = self.args_container.container_intersrna( "frag", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) intergenic_srna(args_srna) if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex_table", prefix])) args_srna = self.args_container.container_intersrna( "tex", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) intergenic_srna(args_srna) files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["normal"], prefix]) self._merge_frag_tex_file(files, args_srna) if "TSS_class" in os.listdir(args_srna.out_folder): tss = os.path.join(args_srna.out_folder, "TSS_class", prefix + "_TSS.gff") return tss def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna): if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) files = {"frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None} if args_srna.tex_wigs is not 
None: files["tex_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix, args_srna) utr_derived_srna(args_srna) if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "frag", prefix, args_srna) utr_derived_srna(args_srna) files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["utr"], prefix]) self._merge_frag_tex_file(files, args_srna) filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr) def _check_necessary_file(self, args_srna): if (args_srna.gffs is None) or (args_srna.trans is None) or ( (args_srna.tex_wigs is None) and ( args_srna.frag_wigs is None)): print("Error: lack required files!!!!") sys.exit() if args_srna.utr_srna: if (args_srna.tss_folder is None): print("Error: lack required TSS files for UTR " "derived sRNA detection!!!!") sys.exit() if (args_srna.pro_folder is None): print("Warning: lack Processing site files for UTR " "derived sRNA detection!!!") print("it may effect the results!!!!") self._check_gff(args_srna.gffs) self._check_gff(args_srna.trans) if args_srna.tss_folder is not None: self._check_gff(args_srna.tss_folder) self.multiparser.parser_gff(args_srna.tss_folder, "TSS") self.multiparser.combine_gff(args_srna.gffs, self.tss_path, None, "TSS") if args_srna.pro_folder is not None: self._check_gff(args_srna.pro_folder) 
self.multiparser.parser_gff(args_srna.pro_folder, "processing") self.multiparser.combine_gff(args_srna.gffs, self.pro_path, None, "processing") if args_srna.sorf_file is not None: self._check_gff(args_srna.sorf_file) self.multiparser.parser_gff(args_srna.sorf_file, "sORF") self.multiparser.combine_gff(args_srna.gffs, self.sorf_path, None, "sORF") if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or ( "blast_nr" in args_srna.import_info) or ( "blast_srna" in args_srna.import_info): if args_srna.fastas is None: print("Error: lack required fasta files for UTR " "derived sRNA detection!!!!") sys.exit() self.multiparser.parser_fasta(args_srna.fastas) self.multiparser.combine_fasta(args_srna.gffs, self.fasta_path, None) if args_srna.terms is not None: self._check_gff(args_srna.terms) self.multiparser.parser_gff(args_srna.terms, "term") self.multiparser.combine_gff(args_srna.gffs, self.term_path, None, "term") else: self.term_path = None def _run_program(self, args_srna): prefixs = [] tss = None for gff in os.listdir(args_srna.gffs): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") prefixs.append(prefix) print("Running sRNA detection of {0}....".format(prefix)) tran = self.helper.get_correct_file( self.tran_path, "_transcript.gff", prefix, None, None) gffs = {"merge": "_".join([self.prefixs["merge"], prefix]), "utr": "_".join([self.prefixs["utr"], prefix]), "normal": "_".join([self.prefixs["normal"], prefix])} csvs = {"merge": "_".join([ self.prefixs["merge_table"], prefix]), "utr": "_".join([self.prefixs["utr_table"], prefix]), "normal": "_".join([ self.prefixs["normal_table"], prefix])} tss = self._run_normal( prefix, gff, tran, args_srna.fuzzy_tsss["inter"], args_srna) if args_srna.utr_srna: print("Running UTR derived sRNA detection of {0}".format( prefix)) if tss is None: tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) if self.pro_path is not None: pro = self.helper.get_correct_file( self.pro_path, 
"_processing.gff", prefix, None, None) else: pro = None if tss is not None: self._run_utrsrna(gff, tran, prefix, tss, pro, args_srna) self._merge_srna(args_srna, gffs, csvs, prefix, os.path.join(args_srna.gffs, gff), tss) filter_frag(csvs["merge"], gffs["merge"]) self.helper.sort_gff(gffs["merge"], "_".join([self.prefixs["basic"], prefix])) return prefixs def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss): print("merging data of intergenic and UTR_derived sRNA...") merge_srna_gff(gffs, args_srna.in_cds, args_srna.cutoff_overlap, gff_file) merge_srna_table(gffs["merge"], csvs, os.path.join(args_srna.wig_path, "_".join([prefix, "forward.wig"])), os.path.join(args_srna.wig_path, "_".join([prefix, "reverse.wig"])), tss, args_srna) def _run_RNAfold(self, seq_file, vienna_path, sec_file): os.system(" ".join(["cat", seq_file, "|", os.path.join(vienna_path, "RNAfold"), "-p", ">", sec_file])) def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path, dot_path, vienna_path): detect = False for fasta in os.listdir(fasta_path): if fasta.endswith(".fa") and ( fasta.replace(".fa", "") == prefix): detect = True break if detect: detect = False seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix])) sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix])) self.helper.get_seq("_".join([self.prefixs["basic"], prefix]), os.path.join(fasta_path, fasta), seq_file) else: print("Error:There is not fasta file of {0}".format(prefix)) print("please check your imported information") sys.exit() tmp_path = os.path.join(out_folder, "tmp_srna") self.helper.check_make_folder(tmp_path) main_path = os.getcwd() os.chdir(tmp_path) sec_file = os.path.join(main_path, sec_file) seq_file = os.path.join(main_path, seq_file) tmp_sec_path = os.path.join(main_path, sec_path) tmp_dot_path = os.path.join(main_path, dot_path) self._run_RNAfold(seq_file, vienna_path, sec_file) extract_energy(os.path.join(main_path, "_".join([self.prefixs["basic"], prefix])), sec_file, 
os.path.join(main_path, "_".join([self.prefixs["energy"], prefix]))) for ps in os.listdir(os.getcwd()): new_ps = ps.replace("|", "_") shutil.move(ps, new_ps) return {"sec": tmp_sec_path, "dot": tmp_dot_path, "main": main_path, "tmp": os.path.join(main_path, tmp_path)} def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file): os.system(" ".join([os.path.join(vienna_util, "relplot.pl"), os.path.join(tmp_paths["tmp"], file_), os.path.join(tmp_paths["tmp"], dot_file), ">", os.path.join(tmp_paths["tmp"], rel_file)])) def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file): call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_), pdf_file]) def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path, prefix): for file_ in os.listdir(os.getcwd()): if file_.endswith("ss.ps"): dot_file = file_.replace("ss.ps", "dp.ps") rel_file = file_.replace("ss.ps", "rss.ps") print("replot {0}".format(file_)) self._run_replot(vienna_util, tmp_paths, file_, dot_file, rel_file) for file_ in os.listdir(tmp_paths["tmp"]): if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")): pdf_file = file_.replace(".ps", ".pdf") print("convert {0} to pdf".format(file_)) self._convert_pdf(ps2pdf14_path, tmp_paths, file_, pdf_file) os.mkdir(os.path.join(tmp_paths["sec"], prefix)) os.mkdir(os.path.join(tmp_paths["dot"], prefix)) self.helper.move_all_content( tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix), ["rss.pdf"]) self.helper.move_all_content( tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix), ["dp.pdf"]) def _run_mountain(self, vienna_util, tmp_paths, dot_file, out): call([os.path.join(vienna_util, "mountain.pl"), os.path.join(tmp_paths["tmp"], dot_file)], stdout=out) def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix, vienna_util): if mountain: tmp_moun_path = os.path.join(tmp_paths["main"], moun_path) os.mkdir(os.path.join(tmp_moun_path, prefix)) txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt") 
self.helper.check_make_folder(txt_path) print("Generating mountain plot of {0}....".format(prefix)) for dot_file in os.listdir(tmp_paths["tmp"]): if dot_file.endswith("dp.ps"): moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt") out = open(moun_txt, "w") moun_file = dot_file.replace("dp.ps", "mountain.pdf") print("Generating {0}".format(moun_file)) self._run_mountain(vienna_util, tmp_paths, dot_file, out) plot_mountain_plot(moun_txt, moun_file) shutil.move(moun_file, os.path.join(tmp_moun_path, prefix, moun_file)) out.close() os.remove(moun_txt) def _compute_2d_and_energy(self, args_srna, prefixs): print("Running energy calculation....") moun_path = os.path.join(args_srna.out_folder, "mountain_plot") sec_path = os.path.join(args_srna.out_folder, "sec_structure", "sec_plot") dot_path = os.path.join(args_srna.out_folder, "sec_structure", "dot_plot") self.helper.remove_all_content(sec_path, None, "dir") self.helper.remove_all_content(dot_path, None, "dir") self.helper.remove_all_content(moun_path, None, "dir") for prefix in prefixs: tmp_paths = self._get_seq_sec( self.fasta_path, args_srna.out_folder, prefix, sec_path, dot_path, args_srna.vienna_path) self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths, args_srna.ps2pdf14_path, prefix) self._plot_mountain(args_srna.mountain, moun_path, tmp_paths, prefix, args_srna.vienna_util) self.helper.remove_all_content(os.getcwd(), ".ps", "file") os.chdir(tmp_paths["main"]) shutil.move("_".join([self.prefixs["energy"], prefix]), "_".join([self.prefixs["basic"], prefix])) shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna")) def _run_blast(self, blast_path, program, database, e, seq_file, blast_file, strand): call([os.path.join(blast_path, program), "-db", database, "-evalue", str(e), "-strand", strand, "-query", seq_file, "-out", blast_file]) def _get_strand_fasta(self, seq_file, out_folder): tmp_plus = os.path.join(out_folder, "tmp_plus.fa") tmp_minus = os.path.join(out_folder, "tmp_minus.fa") out_p = 
open(tmp_plus, "w") out_m = open(tmp_minus, "w") strand = "" with open(seq_file) as sh: for line in sh: line = line.strip() if line.startswith(">"): if line[-1] == "+": out_p.write(line + "\n") strand = "plus" elif line[-1] == "-": out_m.write(line + "\n") strand = "minus" else: if strand == "plus": out_p.write(line + "\n") elif strand == "minus": out_m.write(line + "\n") out_p.close() out_m.close() return tmp_plus, tmp_minus def _blast(self, database, database_format, data_type, args_srna, prefixs, program, database_type, e): if (database is None): print("Error: No database assigned!") else: if database_format: self._formatdb(database, data_type, args_srna.out_folder, args_srna.blast_path, database_type) for prefix in prefixs: blast_file = os.path.join( args_srna.out_folder, "blast_result_and_misc", "_".join([database_type, "blast", prefix + ".txt"])) srna_file = "_".join([self.prefixs["basic"], prefix]) out_file = os.path.join( args_srna.out_folder, "_".join(["tmp", database_type, prefix])) print("Running Blast of {0}".format(prefix)) seq_file = os.path.join( args_srna.out_folder, "_".join(["sRNA_seq", prefix])) if seq_file not in os.listdir(args_srna.out_folder): self.helper.get_seq( srna_file, os.path.join(self.fasta_path, prefix + ".fa"), seq_file) if database_type == "nr": tmp_plus, tmp_minus = self._get_strand_fasta( seq_file, args_srna.out_folder) tmp_blast = os.path.join("tmp_blast.txt") self._run_blast(args_srna.blast_path, program, database, e, tmp_plus, tmp_blast, "plus") self._run_blast(args_srna.blast_path, program, database, e, tmp_minus, blast_file, "minus") self.helper.merge_file(tmp_blast, blast_file) os.remove(tmp_blast) os.remove(tmp_plus) os.remove(tmp_minus) else: self._run_blast(args_srna.blast_path, program, database, e, seq_file, blast_file, "both") extract_blast(blast_file, srna_file, out_file, out_file + ".csv", database_type) shutil.move(out_file, srna_file) def _class_srna(self, prefixs, args_srna): if (len(args_srna.import_info) != 1) 
or ( len(args_srna.import_info) != 0): for prefix in prefixs: print("classifying sRNA of {0}".format(prefix)) class_gff = os.path.join(self.gff_output, "for_class") class_table = os.path.join(self.table_output, "for_class") self.helper.check_make_folder(os.path.join(class_table, prefix)) self.helper.check_make_folder(os.path.join(class_gff, prefix)) class_gff = os.path.join(class_gff, prefix) class_table = os.path.join(class_table, prefix) self.helper.check_make_folder(class_table) self.helper.check_make_folder(class_gff) out_stat = os.path.join( self.stat_path, "_".join([ "stat_sRNA_class", prefix + ".csv"])) classify_srna(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), class_gff, out_stat, args_srna) for srna in os.listdir(class_gff): out_table = os.path.join( class_table, srna.replace(".gff", ".csv")) gen_srna_table( os.path.join(class_gff, srna), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, out_table) def _get_best_result(self, prefixs, args_srna): for prefix in prefixs: best_gff = os.path.join(self.all_best["best_gff"], "_".join([prefix, "sRNA.gff"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) gen_best_srna(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), best_gff, args_srna) gen_srna_table(os.path.join(self.all_best["best_gff"], "_".join([prefix, "sRNA.gff"])), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, best_table) def _remove_file(self, args_srna): self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir") self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file") self.helper.remove_tmp(args_srna.fastas) self.helper.remove_tmp(args_srna.gffs) if args_srna.frag_wigs is not None: self.helper.remove_tmp(args_srna.frag_wigs) if 
args_srna.tex_wigs is not None: self.helper.remove_tmp(args_srna.tex_wigs) if (args_srna.frag_wigs is not None) and ( args_srna.tex_wigs is not None): shutil.rmtree(args_srna.merge_wigs) self.helper.remove_tmp(args_srna.trans) if args_srna.tss_folder is not None: self.helper.remove_tmp(args_srna.tss_folder) if args_srna.pro_folder is not None: self.helper.remove_tmp(args_srna.pro_folder) if args_srna.sorf_file is not None: self.helper.remove_tmp(args_srna.sorf_file) if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) if self.term_path is not None: self.helper.remove_tmp(args_srna.terms) def _filter_srna(self, args_srna, prefixs): if "sec_str" in args_srna.import_info: self._compute_2d_and_energy(args_srna, prefixs) if "blast_nr" in args_srna.import_info: self._blast(args_srna.nr_database, args_srna.nr_format, "prot", args_srna, prefixs, "blastx", "nr", args_srna.e_nr) if "blast_srna" in args_srna.import_info: self._blast(args_srna.srna_database, args_srna.srna_format, "nucl", args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna) if "sorf" in args_srna.import_info: for prefix in prefixs: if ("_".join([prefix, "sORF.gff"]) in os.listdir(self.sorf_path)): tmp_srna = os.path.join(args_srna.out_folder, "".join(["tmp_srna_sorf", prefix])) tmp_sorf = os.path.join(args_srna.out_folder, "".join(["tmp_sorf_srna", prefix])) srna_sorf_comparison( "_".join([self.prefixs["basic"], prefix]), os.path.join(self.sorf_path, "_".join([prefix, "sORF.gff"])), tmp_srna, tmp_sorf) os.remove(tmp_sorf) shutil.move(tmp_srna, "_".join([self.prefixs["basic"], prefix])) def _import_info_format(self, import_info): new_info = [] for info in import_info: info = info.lower() new_info.append(info) return new_info def _gen_table(self, prefixs, args_srna): for prefix in prefixs: out_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) gen_srna_table(os.path.join(self.all_best["all_gff"], "_".join([prefix, 
"sRNA.gff"])), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, out_table) def _print_rank_all(self, prefixs): for prefix in prefixs: all_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) print_rank_all(all_table, best_table) def _filter_min_utr(self, prefixs, min_utr): for prefix in prefixs: filter_utr(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])), min_utr) def _antisense(self, gffs, prefixs): for prefix in prefixs: all_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) all_gff = os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])) best_gff = os.path.join(self.all_best["best_gff"], "_".join([prefix, "sRNA.gff"])) srna_antisense(all_gff, all_table, os.path.join(gffs, prefix + ".gff")) srna_antisense(best_gff, best_table, os.path.join(gffs, prefix + ".gff")) def _blast_stat(self, stat_path, srna_tables): for srna_table in os.listdir(os.path.join(srna_tables, "best")): out_srna_blast = os.path.join( stat_path, "stat_" + srna_table.replace(".csv", "_blast.csv")) blast_class(os.path.join(srna_tables, "best", srna_table), out_srna_blast) def _compare_term_promoter(self, out_table, prefix, args_srna): if ("term" in args_srna.import_info) and ( self.term_path is not None): compare_srna_term(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), out_table, os.path.join(self.term_path, "_".join([prefix, "term.gff"])), args_srna.fuzzy_b, args_srna.fuzzy_a) if ("promoter" in args_srna.import_info) and ( args_srna.promoter_table is not None) and ( "tss" in args_srna.import_info): 
compare_srna_promoter(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), out_table, args_srna) def run_srna_detection(self, args_srna): self._check_necessary_file(args_srna) self.multiparser.parser_gff(args_srna.trans, "transcript") self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None, "transcript") args_srna.import_info = self._import_info_format(args_srna.import_info) prefixs = self._run_program(args_srna) self._filter_srna(args_srna, prefixs) for prefix in prefixs: shutil.copyfile("_".join([self.prefixs["basic"], prefix]), os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"]))) self._compare_term_promoter("_".join([self.prefixs["merge_table"], prefix]), prefix, args_srna) self._gen_table(prefixs, args_srna) self._class_srna(prefixs, args_srna) self._get_best_result(prefixs, args_srna) self._print_rank_all(prefixs) if "blast_srna" in args_srna.import_info: self._blast_stat(self.stat_path, self.table_output) self._remove_file(args_srna)
class OperonDetection(object):
    '''detection of operon'''

    def __init__(self, args_op):
        '''Create the helpers and remember the working paths.

        args_op supplies the annotation folders (gffs, trans and the
        optional tsss / terms) plus output_folder. The TSS and terminator
        paths are set to None when the matching folder was not given.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_op.tsss is not None:
            self.tss_path = os.path.join(args_op.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")
        else:
            self.term_path = None

    def _check_gff(self, gffs, type_):
        '''Run helper.check_uni_attributes on every *.gff file in gffs.

        type_ is accepted for the callers' convenience but is not used.
        '''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _detect_operon(self, prefixs, args_op, log):
        '''Call operon() once per prefix and log the generated files.

        For each prefix the gff goes to <output_folder>/gffs and the table
        to self.table_path. TSS / terminator inputs are passed to operon()
        as False when the matching path is None.
        '''
        log.write("Running detect_operon.py to detect operon.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            out_gff = os.path.join(args_op.output_folder, "gffs",
                                   "_".join([prefix, "operon.gff"]))
            out_table = os.path.join(self.table_path,
                                     "_".join([prefix, "operon.csv"]))
            print("Detecting operons of {0}".format(prefix))
            if self.tss_path is None:
                tss = False
            else:
                tss = self.helper.get_correct_file(
                    self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            gff = self.helper.get_correct_file(
                args_op.gffs, ".gff", prefix, None, None)
            if self.term_path is None:
                term = False
            else:
                term = self.helper.get_correct_file(
                    self.term_path, "_term.gff", prefix, None, None)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table, out_gff)
            log.write("\t" + out_table + "\n")
            log.write("\t" + out_gff + "\n")

    def _check_and_parser_gff(self, args_op):
        '''Check the input gff files and hand them to the multiparser.

        Genome annotations and transcripts are always processed; TSS and
        terminator folders only when they were supplied.
        '''
        self._check_gff(args_op.gffs, "gff")
        self._check_gff(args_op.trans, "tran")
        self.multiparser.parser_gff(args_op.gffs, None)
        self.multiparser.parser_gff(args_op.trans, "transcript")
        self.multiparser.combine_gff(args_op.gffs, self.tran_path,
                                     None, "transcript")
        if args_op.tsss is not None:
            self._check_gff(args_op.tsss, "tss")
            self.multiparser.parser_gff(args_op.tsss, "TSS")
            self.multiparser.combine_gff(args_op.gffs, self.tss_path,
                                         None, "TSS")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.multiparser.parser_gff(args_op.terms, "term")
            self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                         None, "term")

    def _stat(self, table_path, stat_folder, log):
        '''Call stat() on every *_operon.csv table found in table_path.

        Each result is written to stat_folder as "stat_<table name>" and
        its path is appended to log.
        '''
        log.write("Running stat_operon.py to do statistics.\n")
        for table in os.listdir(table_path):
            if table.endswith("_operon.csv"):
                filename = "_".join(["stat", table])
                out_stat = os.path.join(stat_folder, filename)
                stat(os.path.join(table_path, table), out_stat)
                log.write("\t" + out_stat + "\n")

    def run_operon(self, args_op, log):
        '''Entry point: parse inputs, detect operons, do statistics and
        finally clean up the temporary folders of every supplied input.'''
        self._check_and_parser_gff(args_op)
        prefixs = []
        for gff in os.listdir(args_op.gffs):
            if gff.endswith(".gff"):
                prefixs.append(gff.replace(".gff", ""))
        self._detect_operon(prefixs, args_op, log)
        self._stat(self.table_path, args_op.stat_folder, log)
        self.helper.remove_tmp_dir(args_op.gffs)
        # The TSS folder is optional (see __init__), so only clean it when
        # it was given -- same guard as for the terminator folder below.
        if args_op.tsss is not None:
            self.helper.remove_tmp_dir(args_op.tsss)
        self.helper.remove_tmp_dir(args_op.trans)
        if args_op.terms is not None:
            self.helper.remove_tmp_dir(args_op.terms)
class OperonDetection(object):
    '''Detect operons and optionally merge all features into one gff.'''

    def __init__(self, args_op):
        '''Create the helpers and remember the "tmp" working paths of the
        TSS, transcript and UTR folders (terminators are optional).'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_op.tsss, "tmp")
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.utr5_path = os.path.join(args_op.utr5s, "tmp")
        self.utr3_path = os.path.join(args_op.utr3s, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        self.term_path = None
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")

    def _lookup(self, folder, suffix, prefix):
        '''Shortcut for helper.get_correct_file without libs / format.'''
        return self.helper.get_correct_file(folder, suffix, prefix,
                                            None, None)

    def _check_gff(self, gffs, type_):
        '''Pass every *.gff file of the folder to check_uni_attributes
        (type_ is not used).'''
        for name in os.listdir(gffs):
            if not name.endswith(".gff"):
                continue
            self.helper.check_uni_attributes(os.path.join(gffs, name))

    def _detect_operon(self, prefixs, args_op):
        '''Call operon() for each prefix; the table is written to
        <table_path>/operon_<prefix>.csv.'''
        for prefix in prefixs:
            table_name = "_".join(["operon", prefix + ".csv"])
            out_table = os.path.join(self.table_path, table_name)
            print("Detection operons of {0}".format(prefix))
            tss = self._lookup(self.tss_path, "_TSS.gff", prefix)
            tran = self._lookup(self.tran_path, "_transcript.gff", prefix)
            gff = self._lookup(args_op.gffs, ".gff", prefix)
            # operon() receives False when no terminator folder was given
            term = False
            if self.term_path is not None:
                term = self._lookup(self.term_path, "_term.gff", prefix)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table)

    def _check_and_parser_gff(self, args_op):
        '''Check all input folders, then parse and combine them feature by
        feature (terminators last and only when supplied).'''
        to_check = ((args_op.tsss, "tss"), (args_op.gffs, "gff"),
                    (args_op.trans, "tran"), (args_op.utr5s, "utr"),
                    (args_op.utr3s, "utr"))
        for folder, label in to_check:
            self._check_gff(folder, label)
        self.multiparser.parser_gff(args_op.gffs, None)
        to_parse = ((args_op.tsss, self.tss_path, "TSS"),
                    (args_op.trans, self.tran_path, "transcript"),
                    (args_op.utr5s, self.utr5_path, "5UTR"),
                    (args_op.utr3s, self.utr3_path, "3UTR"))
        for folder, tmp_path, feature in to_parse:
            self.multiparser.parser_gff(folder, feature)
            self.multiparser.combine_gff(args_op.gffs, tmp_path,
                                         None, feature)
        if args_op.terms is None:
            return
        self._check_gff(args_op.terms, "term")
        self.multiparser.parser_gff(args_op.terms, "term")
        self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                     None, "term")

    def _stat(self, table_path, stat_folder):
        '''Call stat() on every operon_*.csv table; the output goes to
        stat_folder as "stat_<table name>".'''
        for table in os.listdir(table_path):
            is_operon_table = (table.startswith("operon_") and
                               table.endswith(".csv"))
            if is_operon_table:
                out_stat = os.path.join(stat_folder, "stat_" + table)
                stat(os.path.join(table_path, table), out_stat)

    def _combine_gff(self, prefixs, args_op):
        '''Call combine_gff() for each prefix, writing
        <output_folder>/gffs/<prefix>_all_features.gff.'''
        gff_folder = os.path.join(args_op.output_folder, "gffs")
        for prefix in prefixs:
            out_file = os.path.join(gff_folder, prefix + "_all_features.gff")
            print("Combine all features of {0}".format(prefix))
            tss = self._lookup(self.tss_path, "_TSS.gff", prefix)
            tran = self._lookup(self.tran_path, "_transcript.gff", prefix)
            gff = self._lookup(args_op.gffs, ".gff", prefix)
            utr5 = self._lookup(self.utr5_path, "_5UTR.gff", prefix)
            utr3 = self._lookup(self.utr3_path, "_3UTR.gff", prefix)
            term = None
            if self.term_path is not None:
                term = self._lookup(self.term_path, "_term.gff", prefix)
            combine_gff(gff, tran, tss, utr5, utr3, term,
                        args_op.tss_fuzzy, args_op.term_fuzzy, out_file)

    def run_operon(self, args_op):
        '''Entry point: parse the inputs, detect operons, optionally do
        statistics / feature combination, then remove temporary data.'''
        self._check_and_parser_gff(args_op)
        prefixs = [name.replace(".gff", "")
                   for name in os.listdir(args_op.gffs)
                   if name.endswith(".gff")]
        self._detect_operon(prefixs, args_op)
        if args_op.statistics:
            self._stat(self.table_path, args_op.stat_folder)
        if args_op.combine:
            self._combine_gff(prefixs, args_op)
        for folder in (args_op.gffs, args_op.utr3s, args_op.utr5s,
                       args_op.tsss, args_op.trans):
            self.helper.remove_tmp(folder)
        if args_op.terms is not None:
            self.helper.remove_tmp(args_op.terms)
class SubLocal(object):
    '''Run psortb on the CDSs of each genome (all CDSs and, when
    transcripts are given, the expressed CDSs) and summarise the results.'''

    def __init__(self, args_sub):
        '''Create the helpers, derive all working/output paths from
        args_sub and create the output folders.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''Create (via helper.check_make_folder) the output, statistics
        and result folders for both the "all" and "expressed" runs.'''
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        '''Write every CDS of gff_file that overlaps a transcript of
        tran_file (same strand and seq_id) to <out_all>/tmp_cds.gff.

        Each CDS is written at most once. All files are handled with
        context managers so they are closed even if parsing fails.
        '''
        with open(gff_file) as fh:
            cdss = [entry for entry in Gff3Parser().entries(fh)
                    if entry.feature == "CDS"]
        with open(tran_file) as th:
            trans = list(Gff3Parser().entries(th))
        with open(os.path.join(self.out_all, "tmp_cds.gff"), "w") as out:
            for cds in cdss:
                for ta in trans:
                    if (cds.strand == ta.strand) and (
                            cds.seq_id == ta.seq_id):
                        # CDS reaches into the transcript from the left,
                        # from the right, covers it, or lies inside it.
                        if ((cds.end < ta.end) and (
                                cds.end > ta.start) and (
                                cds.start <= ta.start)) or (
                                (cds.start > ta.start) and (
                                cds.start < ta.end) and (
                                cds.end >= ta.end)) or (
                                (cds.end >= ta.end) and (
                                cds.start <= ta.start)) or (
                                (cds.end <= ta.end) and (
                                cds.start >= ta.start)):
                            out.write(cds.info + "\n")
                            break

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        '''Generate <prefix>_dna.fa and <prefix>_protein.fa in tmp_path.

        When tran_path is given only the CDSs selected by
        _compare_cds_tran are used. A scratch file named "tmp" in the
        current working directory holds the raw translation and is removed
        afterwards. Returns the prefix (gff name without ".gff").
        '''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generate CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path,
                             "_".join([prefix, "transcript.gff"])))
            self.helper.get_cds_seq(tmp_cds, fasta, dna_seq_file)
            os.remove(tmp_cds)
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("transfer DNA seq to protein seq of {0}".format(prefix))
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        '''Execute psortb, sending stdout to out_raw and stderr to
        out_err (both open file objects).'''
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder,
                    tmp_path, tmp_result):
        '''Run psortb on <tmp_path>/<prefix>_protein.fa.

        Raw output goes to <tmp_result>/<prefix>_raw.txt and stderr to
        <out_folder>/tmp_log. Exits the program when args_sub.gram is
        neither "positive" nor "negative".
        '''
        print("Running psortb of {0}".format(prefix))
        # Validate before touching the file system so that a wrong strain
        # type neither leaves open handles nor creates empty output files.
        if args_sub.gram == "positive":
            strain_type = "-p"
        elif args_sub.gram == "negative":
            strain_type = "-n"
        else:
            print("Error:It is not a proper bacteria type - {0}!!".format(
                  args_sub.gram))
            sys.exit()
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        raw_file = os.path.join(tmp_result,
                                "_".join([prefix, self.endfix_raw]))
        with open(os.path.join(out_folder, "tmp_log"), "w") as out_err:
            with open(raw_file, "w") as out_raw:
                self._psortb(args_sub.psortb_path, strain_type,
                             prot_seq_file, out_raw, out_err)

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''Convert the raw psortb output of prefix into a table with
        extract_psortb; when args_sub.merge is set the result is also
        written into a new gff which then replaces gff_file.'''
        raw_file = os.path.join(tmp_psortb_path,
                                "_".join([prefix, self.endfix_raw]))
        table_file = os.path.join(tmp_psortb_path,
                                  "_".join([prefix, self.endfix_table]))
        if args_sub.merge:
            print("Merge to gff...")
            extract_psortb(raw_file, table_file, gff_file,
                           prefix + ".gff", args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(raw_file, table_file, None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result):
        '''For every "<prefix>.gff_folder" in gffs: copy the raw results
        of its members to <psortb_result>/<prefix>, merge their tables into
        one table and run stat_sublocal on the merged table.'''
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_sublocal(merge_table,
                              os.path.join(stat_path, prefix, prefix),
                              os.path.join(stat_path, prefix, "_".join([
                                  "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        '''Remove temporary folders and the psortb error logs.'''
        self.helper.remove_tmp(args_sub.fastas)
        self.helper.remove_tmp(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        # the "expressed" log only exists when transcripts were analysed
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))

    def run_sub_local(self, args_sub):
        '''Entry point: check and parse the inputs, run psortb per genome
        (expressed CDSs first when transcripts are given, then all CDSs),
        merge the tables, compute statistics and clean up.'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed gene now...")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix,
                                     os.path.join(self.gff_path, gff))
            print("Running all gene now...")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result)
        self._remove_tmps(args_sub)
class Terminator(object):
    '''detection of terminator

    Orchestrates the terminator workflow visible in this class: convert
    annotation gff files to ptt/rnt, run TransTermHP, convert its output
    to gff, scan intergenic sequences with RNAfold, check the candidates
    against coverage, then write statistics and clean up temporary data.
    '''

    def __init__(self, args_term):
        '''Build all working paths from ``args_term`` and create the
        output folders.

        ``args_term.gffs``, ``args_term.fastas`` and ``args_term.trans``
        are joined with "tmp" here, so they must be path strings.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Temporary locations; several live in the current working directory.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        '''Concatenate the data rows of several ptt/rnt files.

        For every existing, non-empty file only the lines that follow the
        first line containing "Location" are copied to ``combine_file``.
        Returns "NO_CDS" as soon as an empty ``.ptt`` file is met
        (TransTermHP needs CDS information), otherwise "Normal".
        '''
        with open(combine_file, 'w') as result:
            for file_ in files:
                if (file_.endswith(".ptt")) and (
                        os.stat(file_).st_size == 0):
                    print("Warning: No CDS information, "
                          "TransTermHP can not work!")
                    return "NO_CDS"
                if os.path.exists(file_) and (
                        os.stat(file_).st_size != 0):
                    check_start = False
                    with open(file_, 'r') as fh:
                        for line in fh:
                            if check_start:
                                result.write(line)
                            if "Location" in line:
                                check_start = True
                    # The file is non-empty, so ``line`` holds its last
                    # line; terminate it when the newline is missing so
                    # the next file starts on a fresh line.
                    if "\n" not in line:
                        result.write("\n")
        return "Normal"

    def _make_gff_folder(self):
        '''Create the gff and table output folder of every candidate
        class.'''
        for kind in ("all", "best", "express", "non"):
            self.helper.check_make_folder(self.terms[kind])
            self.helper.check_make_folder(self.csvs[kind])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log):
        '''Convert every ``.gff`` in ``gff_path`` to ptt/rnt files.

        Returns ``(file_types, prefixs)``: ``file_types`` maps a prefix to
        "srna" when a matching sRNA gff was converted as well and to
        "normal" otherwise; ``prefixs`` lists all handled prefixes.
        Exits the program when the fasta file of a prefix is missing.
        '''
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if not gff.endswith(".gff"):
                continue
            filename = gff.split("/")
            prefix = filename[-1][:-4]
            prefixs.append(prefix)
            gff_file = os.path.join(gff_path, gff)
            rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
            ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
            fasta = self.helper.get_correct_file(
                fasta_path, ".fa", prefix, None, None)
            if not fasta:
                log.write("{0}.fa can not be found.\n".format(prefix))
                print("Error: {0}.fa can not be found!".format(prefix))
                sys.exit()
            srna = None
            if sRNAs:
                self.multiparser.parser_gff(sRNAs, "sRNA")
                srna = self.helper.get_correct_file(
                    self.srna_path, "_sRNA.gff", prefix, None, None)
            if srna:
                srna_rnt = srna.replace(".gff", ".rnt")
                log.write("Running converter.py to convert {0} and "
                          "{1} to {2}, {3}, and {4}.\n".format(
                              gff_file, srna, ptt_file, rnt_file, srna_rnt))
                self.converter.convert_gff2rntptt(
                    gff_file, fasta, ptt_file, rnt_file, srna, srna_rnt)
                file_types[prefix] = "srna"
                log.write("The following files are generated:\n")
                log.write("\t{0}\n\t{1}\n\t{2}\n".format(
                    ptt_file, rnt_file, srna_rnt))
            else:
                # No sRNA folder given, or no sRNA gff for this prefix.
                log.write("Running converter.py to convert {0} "
                          "to {1}, and {2}.\n".format(
                              gff_file, ptt_file, rnt_file))
                self.converter.convert_gff2rntptt(
                    gff_file, fasta, ptt_file, rnt_file, None, None)
                file_types[prefix] = "normal"
                log.write("The following files are generated:\n")
                log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        '''Merge the ptt/rnt (and sRNA rnt) files of each prefix into one
        ptt file under ``self.combine_path``.

        Returns the status of the last combined prefix ("Normal" or
        "NO_CDS"); "Normal" when ``file_types`` is empty.
        '''
        self.helper.check_make_folder(self.combine_path)
        # Default keeps the return value defined when nothing is combined
        # (previously this raised UnboundLocalError).
        check = "Normal"
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                check = self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                check = self._combine_annotation(combine_file, files)
        return check

    def _TransTermHP(self, fasta, file_, out_path, prefix, out,
                     args_term, log):
        '''Run TransTermHP for one genome, sending its stdout to ``out``
        and recording the command line in ``log``.'''
        command = [
            args_term.TransTermHP_path, "-p", args_term.expterm_path,
            fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
            os.path.join(out_path, "_".join([
                prefix,
                "terminators_within_robust_tail-to-tail_regions.t2t"])),
            "--bag-output", os.path.join(out_path, "_".join([
                prefix, "best_terminator_after_gene.bag"]))]
        call(command, stdout=out)
        log.write(" ".join(command) + "\n")

    def _run_TransTermHP(self, args_term, log):
        '''Run TransTermHP on every combined ptt file, then delete the
        combine folder. Exits when a fasta file is missing.'''
        self.helper.check_make_folder(self.tmps["transterm"])
        log.write("Running TransTermHP.\n")
        log.write("Make sure the version is at least 2.09.\n")
        for file_ in os.listdir(self.combine_path):
            if ".ptt" not in file_:
                continue
            prefix = file_.replace(".ptt", "")
            fasta = self.helper.get_correct_file(
                self.fasta_path, ".fa", prefix, None, None)
            if not fasta:
                log.write("{0}.fa can not be found!.\n".format(prefix))
                print("Error: {0}.fa can not be found!".format(prefix))
                sys.exit()
            out_path = os.path.join(args_term.hp_folder, prefix)
            self.helper.check_make_folder(out_path)
            with open(os.path.join(out_path,
                                   "_".join([prefix, "terminators.txt"])),
                      "w") as out:
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term, log)
            log.write("Done!\n")
            log.write("The following files are generated in {0}.\n".format(
                out_path))
            for generated in os.listdir(out_path):
                log.write("\t" + generated + "\n")
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term, log):
        '''Convert the ``.bag`` output of TransTermHP to gff files in the
        transterm tmp folder and combine them per annotation gff.'''
        log.write("Running coverter.py to convert the results of TransTermHP "
                  "to gff3 format.\n")
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix != folder:
                    continue
                out_path = os.path.join(args_term.hp_folder, folder)
                for file_ in os.listdir(out_path):
                    if file_.endswith(".bag"):
                        out_file = os.path.join(
                            self.tmps["transterm"],
                            "_".join([prefix, self.tmps["hp_gff"]]))
                        self.converter.convert_transtermhp2gff(
                            os.path.join(out_path, file_), out_file)
                        log.write("\t" + out_file + " is generated.\n")
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        '''Return the folder holding the wiggle files to use.

        When both TEX and fragmented wig folders are given their files are
        copied into a sibling "merge_wigs" folder; when only one is given
        that folder is returned. Exits when neither is assigned.
        '''
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig_folder in (args_term.tex_wigs, args_term.frag_wigs):
                for wig in os.listdir(wig_folder):
                    # Sub-folders are skipped; only plain files are copied.
                    if not os.path.isdir(os.path.join(wig_folder, wig)):
                        shutil.copy(os.path.join(wig_folder, wig),
                                    merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information

        When ``sRNAs`` is given, the annotation gff and the sRNA gff of
        each prefix are merged and sorted into the merge tmp folder, which
        is returned; otherwise ``gff_path`` is returned unchanged.
        '''
        if sRNAs is None:
            return gff_path
        self.multiparser.parser_gff(sRNAs, "sRNA")
        self.helper.check_make_folder(self.tmps["merge"])
        for prefix in prefixs:
            tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
            if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                os.remove(tmp_gff)
            self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                   tmp_gff)
            self.helper.merge_file(os.path.join(
                self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
            self.helper.sort_gff(tmp_gff, os.path.join(
                self.tmps["merge"], prefix + ".gff"))
            os.remove(tmp_gff)
        return self.tmps["merge"]

    def _move_file(self, term_outfolder, csv_outfolder):
        '''Sort each ``*_term.gff`` in ``term_outfolder``, move it to the
        all-candidates folder as ``*_term_all.gff`` and rebuild the
        matching csv table from the per-genome raw tables.

        ``csv_outfolder`` is accepted for interface compatibility; the
        method writes to ``self.csvs["all"]``.
        '''
        for gff in os.listdir(term_outfolder):
            if not gff.endswith("_term.gff"):
                continue
            term_gff = os.path.join(term_outfolder, gff)
            self.helper.sort_gff(term_gff, self.tmps["gff"])
            shutil.move(self.tmps["gff"], term_gff)
            prefix = gff.replace("_term.gff", "")
            new_gff = os.path.join(self.terms["all"], "_".join([
                prefix, self.suffixs["allgff"]]))
            csv_name = "_".join([prefix, self.suffixs["csv"]])
            csv_file = os.path.join(self.csvs["all"], csv_name)
            with open(new_gff, "w") as out:
                out.write("##gff-version 3\n")
            self.helper.merge_file(term_gff, new_gff)
            os.remove(term_gff)
            pre_strain = ""
            if csv_name in os.listdir(self.csvs["all"]):
                os.remove(csv_file)
            with open(csv_file, "w") as out_csv:
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                                         "Strand", "Detect",
                                         "Coverage_decrease",
                                         "Coverage_detail"]) + "\n")
            with open(new_gff) as fh:
                for entry in self.gff_parser.entries(fh):
                    # Append the raw table once per run of entries that
                    # share a seq_id.
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])), csv_file)
                    pre_strain = entry.seq_id

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log):
        '''Run RNAfold on ``tmp_seq`` and store the structures in
        ``tmp_sec``.

        The command is executed through the shell from inside a temporary
        folder (removed afterwards), so the two file paths are addressed
        relative to its parent ("..").
        '''
        log.write("Computing secondray structures of {0}.\n".format(prefix))
        log.write("Make sure the version of Vienna RNA package is at least "
                  "2.3.2.\n")
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        command = " ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                            ">", os.path.join("..", tmp_sec)])
        os.chdir(self.tmps["folder"])
        try:
            log.write(command + "\n")
            os.system(command)
            log.write("Done!\n")
            log.write("\t" + tmp_sec + " is generated for storing secondary "
                      "structure.\n")
        finally:
            # Always return to the caller's directory, even on failure.
            os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term, log):
        '''the approach for searching gene converged region terminator

        For each prefix with a transcript gff, intergenic sequences are
        extracted, folded with RNAfold and scanned for candidates; the
        coverage check then runs for every prefix. Finally the resulting
        gff files are combined and moved to the output folders.
        '''
        log.write("Searching terminators which located in gene converged "
                  "region.\n")
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            fasta_file = os.path.join(self.fasta_path, prefix + ".fa")
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                log.write("Running get_inter_seq.py to extract the potential "
                          "sequences from {0}.\n".format(prefix))
                intergenic_seq(fasta_file, tran_file, gff_file,
                               tmp_seq, tmp_index, args_term)
                log.write("\t" + tmp_seq + " is generated for storing the "
                          "potential sequences.\n")
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix, log)
                log.write("Running extract_sec_info.py to extract the "
                          "information of secondary structure from "
                          "{0}.\n".format(prefix))
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                log.write("Running get_polyT.py to detect the "
                          "terminator candidates for {0}.\n".format(prefix))
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file,
                       tmp_cand, args_term)
                log.write("\t" + tmp_cand + " which temporary stores "
                          "terminator candidates is generated.\n")
            print("Detecting terminators for " + prefix)
            log.write("Running detect_coverage_term.py to gain "
                      "high-confidence terminators for {0}.\n".format(prefix))
            detect_coverage(
                tmp_cand, gff_file, tran_file, fasta_file,
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        '''Delete the temporary folders and intermediate files created by
        the run.'''
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term, log):
        '''Renumber the terminators of every ``*_term_all.gff`` (ID and
        Name attributes), write them as ``*_term.gff`` and run the
        statistics for each prefix.'''
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if not gff.endswith("_term_all.gff"):
                continue
            new_prefix = gff.replace("_term_all.gff", "")
            new_prefixs.append(new_prefix)
            with open(self.tmps["gff"], "w") as out_tmp, \
                    open(os.path.join(self.terms["all"], gff)) as fh:
                out_tmp.write("##gff-version 3\n")
                for num, entry in enumerate(self.gff_parser.entries(fh)):
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                        entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "terminator_" + name
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                             entry.attribute_string]) + "\n")
            shutil.move(self.tmps["gff"], os.path.join(
                self.terms["all"],
                "_".join([new_prefix, self.suffixs["gff"]])))
        log.write("Running stat_term.py to do statistics.\n")
        stat_path = os.path.join(args_term.out_folder, "statistics")
        log.write("The following files are generated:\n")
        for prefix in new_prefixs:
            gff_name = "_".join([prefix, self.suffixs["gff"]])
            csv_name = "_".join([prefix, self.suffixs["csv"]])
            stat_file = os.path.join(stat_path,
                                     "_".join(["stat", prefix + ".csv"]))
            stat_term(os.path.join(self.terms["all"], gff_name),
                      os.path.join(self.csvs["all"], csv_name),
                      stat_file,
                      os.path.join(self.terms["best"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                                   "_".join([prefix, "term"])))
            # The csv tables are moved from the gff folders to the table
            # folders.
            for kind in ("best", "express", "non"):
                shutil.move(os.path.join(self.terms[kind], csv_name),
                            os.path.join(self.csvs[kind], csv_name))
            os.remove(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["allgff"]])))
            for kind in ("all", "best", "express", "non"):
                log.write("\t" + os.path.join(self.terms[kind], gff_name)
                          + "\n")
            log.write("\t" + os.path.join(self.csvs["all"], csv_name) + "\n")
            log.write("\t" + stat_file + "\n")
            for kind in ("best", "express", "non"):
                log.write("\t" + os.path.join(self.csvs[kind], csv_name)
                          + "\n")

    def _check_gff_file(self, folder):
        '''Run the attribute check on every ``.gff`` file in ``folder``.'''
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs, log):
        '''searching the associated terminator to transcript

        The ``prefixs`` argument is not used: the prefixes are re-derived
        from the combined transcript gff files.
        '''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        print("Comparing terminators with transcripts now")
        tran_prefixs = [file_.replace("_transcript.gff", "")
                        for file_ in os.listdir(self.tran_path)
                        if file_.endswith("_transcript.gff")]
        log.write("Running compare_tran_term.py for comparing transcripts "
                  "and terminators.\n")
        log.write("The following files are generated:\n")
        stat_folder = os.path.join(args_term.out_folder, "statistics")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in tran_prefixs:
                # Rename the statistics file so each candidate class keeps
                # its own copy.
                target = os.path.join(stat_folder, "_".join([
                    "stat_compare_terminator_transcript", prefix,
                    type_ + ".csv"]))
                shutil.move(
                    os.path.join(
                        stat_folder,
                        "stat_compare_transcript_terminator_" + prefix
                        + ".csv"),
                    target)
                log.write("\t" + target + "\n")

    def _re_table(self, args_term, prefixs, log):
        '''Reorganize the "Coverage_detail" column of every output
        table.'''
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates",
                      "expressed_candidates", "non_expressed_candidates"]:
            table_folder = os.path.join(args_term.out_folder, "tables", type_)
            for table in os.listdir(table_folder):
                term_table = os.path.join(table_folder, table)
                reorganize_table(args_term.libs, args_term.merge_wigs,
                                 "Coverage_detail", term_table)
                log.write("\t" + term_table + "\n")

    def run_terminator(self, args_term, log):
        '''Run the complete terminator detection workflow.

        Exits with an error message when the gff or fasta input is not
        assigned.
        '''
        # Validate before the paths are used: the checks below list these
        # folders, so testing afterwards could never report the problem.
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas, log)
        check = self._combine_ptt_rnt(self.gff_path, file_types,
                                      self.srna_path)
        self._run_TransTermHP(args_term, log)
        self._convert_to_gff(prefixs, args_term, log)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        if check != "NO_CDS":
            self.multiparser.parser_gff(self.tmps["transterm"],
                                        self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term, log)
        self._compute_stat(args_term, log)
        self._compare_term_tran(args_term, prefixs, log)
        self._re_table(args_term, prefixs, log)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class SubLocal(object):
    '''detection of subcellular localization

    Extracts CDS sequences (all CDSs and, when transcripts are given, the
    expressed ones), translates them, runs Psortb and merges the results
    into tables and statistics.
    '''

    def __init__(self, args_sub):
        '''Build all working paths from ``args_sub`` and create the
        output folders.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder,
                                        "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express,
                                              "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''Create the output, statistics and result folders.'''
        for folder in (self.out_all, self.out_express,
                       self.all_stat_path, self.express_stat_path,
                       self.all_result, self.express_result):
            self.helper.check_make_folder(folder)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS

        Every CDS that overlaps a transcript on the same strand and
        genome is written once to ``tmp_cds.gff`` in ``self.out_all``.
        '''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")

        def overlaps(cds, ta):
            # Same strand and genome, plus one of the four overlap shapes:
            # CDS crosses the transcript start, crosses its end, covers
            # it, or lies inside it.
            if (cds.strand != ta.strand) or (cds.seq_id != ta.seq_id):
                return False
            return (((cds.end < ta.end) and (cds.end > ta.start) and
                     (cds.start <= ta.start)) or
                    ((cds.start > ta.start) and (cds.start < ta.end) and
                     (cds.end >= ta.end)) or
                    ((cds.end >= ta.end) and (cds.start <= ta.start)) or
                    ((cds.end <= ta.end) and (cds.start >= ta.start)))

        with open(tmp_cds, "w") as out, open(gff_file) as fh, \
                open(tran_file) as th:
            cdss = [entry for entry in Gff3Parser().entries(fh)
                    if entry.feature == "CDS"]
            trans = list(Gff3Parser().entries(th))
            for cds in cdss:
                if any(overlaps(cds, ta) for ta in trans):
                    out.write(cds.info + "\n")
        log.write("\t" + tmp_cds + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        '''Write the DNA and protein fasta files of the CDSs in ``gff``
        to ``tmp_path`` and return the file prefix.

        With ``tran_path`` only the CDSs overlapping a transcript are
        used; with ``None`` all CDSs are used.
        '''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])), log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
            self.helper.get_cds_seq(tmp_cds, fasta, dna_seq_file)
            os.remove(tmp_cds)
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        '''Log and run one Psortb call, sending stdout to ``out_raw`` and
        stderr to ``out_err``.'''
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path,
                    tmp_result, log):
        '''Run Psortb for ``prefix`` with the flag matching
        ``args_sub.gram`` ("positive" or "negative"); exit on any other
        value.'''
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        raw_file = os.path.join(tmp_result,
                                "_".join([prefix, self.endfix_raw]))
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        gram_flags = {"positive": "-p", "negative": "-n"}
        # ``with`` closes both files on every path, including sys.exit().
        with open(os.path.join(out_folder, "tmp_log"), "w") as out_err, \
                open(raw_file, "w") as out_raw:
            if args_sub.gram in gram_flags:
                self._psortb(args_sub.psortb_path,
                             gram_flags[args_sub.gram],
                             prot_seq_file, out_raw, out_err, log)
            else:
                log.write("Please assign \"positive\" or \"negative\" to "
                          "--bacteria_type.\n")
                print("Error: {0} is not a proper bacteria type! "
                      "Please assign positive or negative.".format(
                          args_sub.gram))
                sys.exit()
            log.write("\t" + raw_file + " is temporary generated.\n")

    def _extract_result(self, args_sub, tmp_psortb_path, prefix,
                        gff_file, log):
        '''extract the result of psortb

        ``gff_file`` is accepted for interface compatibility; it is not
        passed on to the extraction.
        '''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        table_file = os.path.join(tmp_psortb_path, "_".join([
            prefix, self.endfix_table]))
        extract_psortb(os.path.join(
            tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            table_file, None, None, args_sub.fuzzy)
        log.write("\t" + table_file + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        '''Rewrite the tab-separated table ``out_all`` so it starts with
        exactly one "#Genome" header line.

        Header lines repeated inside the merged table are dropped; blank
        lines are dropped as well (they used to raise IndexError).
        '''
        tmp_table = out_all + "_tmp"
        with open(tmp_table, "w") as out, open(out_all, "r") as fh:
            out.write("\t".join(["#Genome", "Protein", "Strand", "Start",
                                 "End", "Location", "Score"]) + "\n")
            for row in csv.reader(fh, delimiter='\t'):
                # csv.reader yields [] for a blank line.
                if row and row[0] != "#Genome":
                    out.write("\t".join(row) + "\n")
        shutil.move(tmp_table, out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result, log):
        '''Merge the per-genome Psortb tables of every ``*.gff_folder``
        in ``gffs`` and compute the statistics for each of them.'''
        for folder in os.listdir(gffs):
            if not folder.endswith(".gff_folder"):
                continue
            prefix = folder.replace(".gff_folder", "")
            result_folder = os.path.join(psortb_result, prefix)
            self.helper.check_make_folder(result_folder)
            merge_table = os.path.join(
                result_folder, "_".join([prefix, self.endfix_table]))
            for gff in os.listdir(os.path.join(gffs, folder)):
                genome = gff.replace(".gff", "")
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_raw,
                    genome, None, None)
                shutil.copy(result, result_folder)
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_table,
                    genome, None, None)
                self.helper.merge_file(result, merge_table)
            log.write("\t" + merge_table + "\n")
            self._remove_header(merge_table)
            stat_folder = os.path.join(stat_path, prefix)
            self.helper.check_make_folder(stat_folder)
            stat_file = os.path.join(stat_folder, "_".join([
                "stat", prefix, "sublocal.csv"]))
            stat_sublocal(merge_table,
                          os.path.join(stat_folder, prefix), stat_file)
            for file_ in os.listdir(stat_folder):
                log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        '''Delete the temporary folders and the Psortb error logs.'''
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        '''Run the complete subcellular localization workflow.'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        expressed = args_sub.trans is not None
        if expressed:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            gff_file = os.path.join(self.gff_path, gff)
            if expressed:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix, gff_file, log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 gff_file, log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if expressed:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result, log)
        self._remove_tmps(args_sub)
class RATT(object):
    '''annotation transfer

    Wraps an external RATT run: reference GenBank files are split and
    converted to EMBL, RATT is called for every reference/target pair,
    and the resulting EMBL files are converted to gff, ptt and rnt files
    which are finally merged per target genome.
    '''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            # scratch file that receives the record currently being split
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            # user-supplied embl files take precedence over converted ones
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        '''generate a ptt and a rnt file next to every gff file in
        ``files`` (file names relative to ``gffs``) for which a target
        fasta file can be found'''
        for gff in files:
            if not gff.endswith(".gff"):
                continue
            gff = os.path.join(gffs, gff)
            filename = gff.split("/")
            prefix = filename[-1][:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)
                log.write("\t" + ptt + " is generated.\n")
                log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''replace the per-sequence gff/ptt/rnt files by the merged ones
        and delete all temporary folders (``out_gbk`` is not used)'''
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''convert one RATT embl result to gff, copy it to the gff output
        folder and append its file name to ``files``'''
        name = ratt_result.split(".")
        # drop the first and the last two dot-separated fields of the name
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''split the GenBank files in ``files`` into one file per record

        Every record (``LOCUS`` ... ``//``) is written to ``self.gbk`` and
        named after its LOCUS or VERSION field. Returns ``self.gbk``.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            # handle of the record being written; None between records so
            # that lines outside a record are skipped instead of being
            # written to a closed (or not yet opened) file
            out = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        if out is not None:
                            # previous record had no "//" terminator
                            out.close()
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(),
                                                         "gbk"])
                                break
                        # NOTE(review): str.find() is -1 (true) when the
                        # LOCUS name is absent and 0 (false) when the
                        # VERSION name starts with it -- kept as is,
                        # confirm the intended comparison.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out is not None:
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
            if out is not None:
                # unterminated last record: close it, the tmp file stays
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl

        Collects the ``.gbk``/``.gbff``/``.gb`` files of ``ref_embls``,
        splits and converts them, and returns the folder holding the split
        GenBank files. Exits if no GenBank file is found.
        '''
        gbks = [os.path.join(ref_embls, embl)
                for embl in os.listdir(ref_embls)
                if embl.endswith((".gbk", ".gbff", ".gb"))]
        if not gbks:
            log.write("--related_gbk_files is assigned, but not gbk files "
                      "are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, "
                      "or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        log.write("Running converter.py to convert gbk file to "
                  "embl format.\n")
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        log.write("\t" + self.embl +
                  " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''call RATT for one target/reference pair; its stdout goes to
        ``out``. Exits if the embl folder or one of the fasta files is
        missing.'''
        tar_fasta = os.path.join(self.tmp_files["tar"], tar + ".fa")
        ref_fasta = os.path.join(self.tmp_files["ref"], ref + ".fa")
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(tar_fasta)) or (
                not os.path.exists(ref_fasta)):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl "
                      "files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        command = [args_ratt.ratt_path, self.embl, tar_fasta,
                   args_ratt.element, args_ratt.transfer_type, ref_fasta]
        log.write(" ".join(command) + "\n")
        call(command, stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''run RATT for every ``ref:tar`` pair and tidy up the working
        directory: "final" files are moved to the output folder, other
        RATT by-products are deleted'''
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # the log is re-created for every pair (mode "w+"); the
            # context manager closes each handle, not only the last one
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        '''entry point: prepare the inputs, run RATT, convert the results
        to gff/ptt/rnt, merge them per target genome and clean up'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # NOTE(review): every other place reads ``args_ratt.ref_gbk``;
            # confirm that ``ref_gbki`` is a real attribute and not a typo.
            out_gbk = self._convert_embl(args_ratt.ref_gbki, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt "
                          "format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    # strip the ".fa" extension
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
class TSSpredator(object):
    '''Generate TSSpredator config files, run TSSpredator and post-process
    the predicted TSSs / processing sites (gff conversion, optional
    filtering, statistics, comparison with other annotations).'''

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # names of temporary files/folders created in the working directory
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''turn the fields of one library string
        (wig, tex, condition, replicate, strand) into a dict'''
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        '''write one ``<prefix>_<condition><replicate> = <wig>`` line per
        condition (1..lib_num) and replicate in ``rep_set``; replicates
        without a library in ``lib_list`` get an empty value'''
        for num_id in range(1, lib_num+1):
            cond_list = [lib for lib in lib_list
                         if num_id == lib["condition"]]
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    # use the condition being processed; the loop variable
                    # of the block above is undefined (or left over from
                    # an earlier condition) when this one has no library
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, num_id, rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix,
                      log):
        '''run TSSpredator with ``config_file``; its stdout/stderr go to
        ``log.txt``/``err.txt`` in ``out_path``'''
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        command = ["java", "-jar", tsspredator_path, config_file]
        with open(os.path.join(out_path, "log.txt"), "w") as out, \
                open(os.path.join(out_path, "err.txt"), "w") as err:
            log.write(" ".join(command) + "\n")
            call(command, stdout=out, stderr=err)
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        '''parse the library strings and write the annotation, five-prime
        library and genome entries of the config file

        Returns ``(lib_num, num_id, rep_set, lib_dict, list_num_id)``:
        the highest condition number, the last condition id written, the
        set of replicate names, the libraries grouped by tex/notex and
        strand ("fp", "fm", "np", "nm"), and the condition ids as strings.
        '''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    # replace the library name by the genome-specific wig
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            # processing sites: the notex libraries act as five-prime ones
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        list_num_id = [str(index) for index in range(1, lib_num+1)]
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match

        Entries of ``args_tss.repmatch`` look like ``<lib>_<number>``. An
        entry containing "all" sets the global ``minNumRepMatches``;
        otherwise the most frequent number becomes the global value and
        the libraries with another number get their own line.
        '''
        for rep in args_tss.repmatch:
            if "all" in rep:
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                return
        nums = {}
        matchs = {}
        for match in args_tss.repmatch:
            lib = match.split("_")[0]
            rep = match.split("_")[-1]
            matchs[lib] = rep
            nums[rep] = nums.get(rep, 0) + 1
        for rep, num in nums.items():
            if num == max(nums.values()):
                out.write("minNumRepMatches = {0}\n".format(rep))
                max_rep = rep
                break
        for lib, rep in matchs.items():
            if rep != max_rep:
                out.write("minNumRepMatches_{0} = {1}\n".format(
                    lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        with open(config_file, "w") as out:
            out.write("TSSinClusterSelectionMethod = HIGHEST\n")
            out.write("allowedCompareShift = 1\n")
            out.write("allowedRepCompareShift = 1\n")
            lib_num, num_id, rep_set, lib_dict, list_num_id = \
                self._import_lib(args_tss.libs, wig_folder,
                                 project_strain_name, out, gff,
                                 args_tss.program, fasta)
            out.write("idList = ")
            out.write(",".join(list_num_id) + "\n")
            out.write("maxASutrLength = 100\n")
            out.write("maxGapLengthInGene = 500\n")
            out.write("maxNormalTo5primeFactor = {0}\n".format(
                      args_tss.processing_factor))
            out.write("maxTSSinClusterDistance = {0}\n".format(
                      args_tss.cluster + 1))
            out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
            out.write("min5primeToNormalFactor = {0}\n".format(
                      args_tss.enrichment_factor))
            out.write("minCliffFactor = {0}\n".format(args_tss.factor))
            out.write("minCliffFactorDiscount = {0}\n".format(
                      args_tss.factor_reduction))
            out.write("minCliffHeight = {0}\n".format(args_tss.height))
            out.write("minCliffHeightDiscount = {0}\n".format(
                      args_tss.height_reduction))
            out.write("minNormalHeight = {0}\n".format(
                      args_tss.base_height))
            self._print_repmatch(args_tss, out)
            out.write("minPlateauLength = 0\n")
            out.write("mode = cond\n")
            out.write("normPercentile = 0.9\n")
            # the libraries not used as five-prime ones are the normal ones
            if args_tss.program.lower() == "tss":
                self._print_lib(lib_num, lib_dict["nm"], out,
                                wig_folder, "normalMinus", rep_set)
                self._print_lib(lib_num, lib_dict["np"], out,
                                wig_folder, "normalPlus", rep_set)
            else:
                self._print_lib(lib_num, lib_dict["fm"], out,
                                wig_folder, "normalMinus", rep_set)
                self._print_lib(lib_num, lib_dict["fp"], out,
                                wig_folder, "normalPlus", rep_set)
            out.write("numReplicates = {0}\n".format(len(rep_set)))
            out.write("numberOfDatasets = {0}\n".format(lib_num))
            out.write("outputDirectory = {0}\n".format(out_path))
            for prefix_id in range(len(args_tss.output_prefixs)):
                out.write("outputPrefix_{0} = {1}\n".format(
                          prefix_id + 1, args_tss.output_prefixs[prefix_id]))
            out.write("projectName = {0}\n".format(project_strain_name))
            out.write("superGraphCompatibility = igb\n")
            out.write("texNormPercentile = 0.5\n")
            out.write("writeGraphs = 0\n")
            out.write("writeNocornacFiles = 0\n")
            log.write("\t" + config_file + " is generated.\n")

    def _convert_gff(self, prefixs, args_tss, log):
        '''convert the MasterTable.tsv of every genome to a gff file'''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            # creates (or empties) the output file even when no
            # MasterTable is found
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # leaves ``gff`` at the annotation file of this genome (or at
            # the last listed file when none matches)
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # the annotation file is replaced by the one written above
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        '''compute the statistics and plots of every genome and collect
        them in its statistics folder'''
        print("Running statistaics")
        for tss in tsss:
            stat_folder = os.path.join(self.stat_outfolder, tss)
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(stat_folder, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                stat_folder))
            for file_ in os.listdir(stat_folder):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        '''write a config file for every genome that has a fasta file, an
        annotation file and at least one wiggle file; returns the names
        of these genomes'''
        prefixs = []
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] != gff[:-4]:
                    continue
                prefix = fasta[:-3]
                # evaluated per genome, so a genome without wiggle files
                # is no longer accepted because an earlier one had some
                detect = any(
                    wig.split("_STRAIN_")[1][:-4] == prefix
                    for wig in os.listdir(self.wig_path))
                if detect:
                    prefixs.append(prefix)
                    config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                    self._gen_config(
                        prefix, args_tss,
                        os.path.join(self.gff_path, gff), self.wig_path,
                        os.path.join(self.fasta_path, fasta), config, log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''merge the wiggle files of ``wig_folder`` that belong to
        ``prefix`` and to one of ``libs`` into tmp/merge_forward.wig and
        tmp/merge_reverse.wig'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''delete the temporary files and folders of the run'''
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        # NOTE(review): removes the whole wig (and manual) input folder --
        # presumably these are working copies; confirm against the caller.
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and
        processing site at the same position'''
        if not args_tss.overlap_feature:
            return
        print("Comparing TSSs and Processing sites")
        if args_tss.program.lower() == "tss":
            for tss in os.listdir(out_folder):
                if tss.endswith("_TSS.gff"):
                    ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                    filter_tss_pro(os.path.join(out_folder, tss),
                                   ref, args_tss.program,
                                   args_tss.cluster)
        elif args_tss.program.lower() == "processing":
            for tss in os.listdir(out_folder):
                if tss.endswith("_processing.gff"):
                    ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                    filter_tss_pro(os.path.join(out_folder, tss),
                                   ref, args_tss.program,
                                   args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                # one cutoff table per genome; the context manager closes
                # every table and nothing is closed when no file matches
                with open(os.path.join(
                        self.stat_outfolder, prefix, "_".join([
                            "stat", prefix,
                            "low_expression_cutoff.csv"])), "w") as out:
                    out.write("\t".join(["Genome", "Cutoff_coverage"]) +
                              "\n")
                    cutoff = filter_low_expression(
                            os.path.join(gff_folder, gff), args_tss,
                            "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                            "tmp/without_low_expression.gff")
                    out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None

    def run_tsspredator(self, args_tss, log):
        '''entry point: generate the config files, run TSSpredator for
        every genome and apply the requested post-processing steps'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # from here on the program is called "processing" instead of "ps"
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class UTRDetection(object):
    '''Detect 5'UTRs and 3'UTRs from genome annotations, TSSs,
    transcripts and (optionally) terminators.'''

    def __init__(self, args_utr):
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.tss_path = os.path.join(args_utr.tsss, "tmp")
        self.tran_path = os.path.join(args_utr.trans, "tmp")
        self.utr5_path = os.path.join(args_utr.out_folder, "5UTR")
        self.utr3_path = os.path.join(args_utr.out_folder, "3UTR")
        self.utr5_stat_path = os.path.join(self.utr5_path, "statistics")
        self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")

    def _check_folder(self, folder):
        '''exit if a required input folder is not assigned'''
        if folder is None:
            print("Error: lack required files!!!")
            sys.exit()

    def _check_gff(self, folder):
        '''check the attributes of every gff file in ``folder``; an
        unassigned (``None``) folder is skipped'''
        if folder is None:
            # os.listdir(None) would list the current working directory
            # and check unrelated gff files
            return
        for gff in os.listdir(folder):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, gff))

    def _compute_utr(self, args_utr):
        '''detect the 5'UTRs and 3'UTRs of every annotation file and move
        the length plots to the statistics folders'''
        for gff in os.listdir(args_utr.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff[:-4]
            annotation = os.path.join(args_utr.gffs, gff)
            tss = self.helper.get_correct_file(
                    self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
            if args_utr.terms:
                term = self.helper.get_correct_file(
                            os.path.join(args_utr.terms, "tmp"),
                            "_term.gff", prefix, None, None)
            else:
                term = None
            print("computing 5'UTR of {0} .....".format(prefix))
            detect_5utr(tss, annotation, tran,
                        os.path.join(self.utr5_path, "gffs",
                                     "_".join([prefix, "5UTR.gff"])),
                        args_utr)
            print("computing 3'UTR of {0} .....".format(prefix))
            detect_3utr(tran, annotation, term,
                        os.path.join(self.utr3_path, "gffs",
                                     "_".join([prefix, "3UTR.gff"])),
                        args_utr)
            self.helper.move_all_content(
                os.getcwd(), self.utr5_stat_path, ["_5utr_length.png"])
            self.helper.move_all_content(
                os.getcwd(), self.utr3_stat_path, ["_3utr_length.png"])

    def run_utr_detection(self, args_utr):
        '''entry point: check and parse the input files, compute the UTRs
        and remove the temporary folders'''
        for folder in (args_utr.tsss, args_utr.gffs, args_utr.trans):
            self._check_folder(folder)
        # terminators are optional; _check_gff skips them when unassigned
        for folder in (args_utr.tsss, args_utr.gffs, args_utr.trans,
                       args_utr.terms):
            self._check_gff(folder)
        self.multiparser.parser_gff(args_utr.gffs, None)
        self.multiparser.parser_gff(args_utr.tsss, "TSS")
        self.multiparser.combine_gff(args_utr.gffs, self.tss_path,
                                     None, "TSS")
        self.multiparser.parser_gff(args_utr.trans, "transcript")
        self.multiparser.combine_gff(args_utr.gffs, self.tran_path,
                                     None, "transcript")
        if args_utr.terms:
            self.multiparser.parser_gff(args_utr.terms, "term")
            self.multiparser.combine_gff(args_utr.gffs,
                                         os.path.join(args_utr.terms, "tmp"),
                                         None, "term")
        self._compute_utr(args_utr)
        for folder in (args_utr.gffs, args_utr.tsss, args_utr.trans,
                       args_utr.terms, self.utr5_path, self.utr3_path):
            self.helper.remove_tmp(folder)
class TSSpredator(object):
    """Run TSSpredator and post-process its predictions.

    The class writes one TSSpredator configuration file per strain, runs the
    TSSpredator jar through ``java``, converts each ``MasterTable.tsv`` to
    GFF and then applies the optional steps selected in ``args_tss``
    (orphan check, low-expression filter, merge with manual annotation,
    TSS/processing-site overlap filter, statistics, validation and
    comparison with a transcript assembly).
    """

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of the temporary files/folders created in the working
        # directory (plus the "ta" entry set below).
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Turn the fields of one ``wig:tex:condition:replicate:strand``
        library description into a dict (``condition`` becomes an int)."""
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        """Write ``<prefix>_<condition><replicate> = <wig path>`` lines.

        Libraries are written condition by condition (1..lib_num) and, within
        one condition, sorted by replicate.
        """
        for num_id in range(1, lib_num + 1):
            cond_list = [lib for lib in lib_list
                         if num_id == lib["condition"]]
            for cond in sorted(cond_list, key=lambda k: k['replicate']):
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Run the TSSpredator jar on *config_file*.

        Standard output and standard error of the java process are written
        to ``log.txt`` and ``err.txt`` inside *out_path*.
        """
        print("Running TSSpredator for " + prefix)
        # Context managers make sure both logs are closed even if the
        # process cannot be started.
        with open(os.path.join(out_path, "log.txt"), "w") as out, \
                open(os.path.join(out_path, "err.txt"), "w") as err:
            call(["java", "-jar", tsspredator_path, config_file],
                 stdout=out, stderr=err)

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        """Parse the library descriptions and write the library part of the
        configuration to *out*.

        Each entry of *libs* has the form
        ``wig:tex|notex:condition:replicate:strand``.  The wig name is
        replaced by the matching ``<name>_STRAIN_<strain>.wig`` file found in
        *wig_folder*.  The program exits when a library does not end with
        ``.wig`` or when *program* is neither ``tss`` nor
        ``processing_site`` (case-insensitive).

        Returns ``(lib_num, num_id, rep_set, lib_dict, list_num_id)`` where
        ``lib_num`` is the highest condition number, ``num_id`` equals
        ``lib_num`` (kept for backward compatibility), ``rep_set`` is the set
        of replicate labels, ``lib_dict`` groups the libraries under the keys
        ``fp``/``fm`` (tex, +/-) and ``np``/``nm`` (notex, +/-) and
        ``list_num_id`` holds the condition numbers as strings.
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        groups = {("tex", "+"): "fp", ("tex", "-"): "fm",
                  ("notex", "+"): "np", ("notex", "-"): "nm"}
        lib_num = 0
        rep_set = set()
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                # len() guard: entries without "_STRAIN_" can never match.
                if (len(filename) > 1) and (
                        filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            lib_num = max(lib_num, int(lib_datas[2]))
            rep_set.add(lib_datas[3])
            key = groups.get((lib_datas[1], lib_datas[4]))
            if key is not None:
                lib_dict[key].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus")
        elif program.lower() == "processing_site":
            # For processing sites the untreated libraries take the
            # "fivePrime" role.
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        list_num_id = [str(num_id) for num_id in range(1, lib_num + 1)]
        # The second element used to be the leaked loop variable, which is
        # unbound when there is no library at all; lib_num is the same value
        # whenever the loop ran.
        return lib_num, lib_num, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss,
                    gff, wig_folder, fasta, config_file):
        """Write the TSSpredator configuration file for one strain.

        The output folder ``MasterTable_<strain>`` is (re)created through the
        helper before *config_file* is written.
        """
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        # "with" closes the file even when _import_lib exits the program.
        with open(config_file, "w") as out:
            out.write("TSSinClusterSelectionMethod = HIGHEST\n")
            out.write("allowedCompareShift = 1\n")
            out.write("allowedRepCompareShift = 1\n")
            lib_num, num_id, rep_set, lib_dict, list_num_id = \
                self._import_lib(args_tss.libs, wig_folder,
                                 project_strain_name, out, gff,
                                 args_tss.program, fasta)
            out.write("idList = ")
            out.write(",".join(list_num_id) + "\n")
            out.write("maxASutrLength = 100\n")
            out.write("maxGapLengthInGene = 500\n")
            out.write("maxNormalTo5primeFactor = {0}\n".format(
                args_tss.processing_factor))
            out.write("maxTSSinClusterDistance = {0}\n".format(
                args_tss.cluster + 1))
            out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
            out.write("min5primeToNormalFactor = {0}\n".format(
                args_tss.enrichment_factor))
            out.write("minCliffFactor = {0}\n".format(args_tss.factor))
            out.write("minCliffFactorDiscount = {0}\n".format(
                args_tss.factor_reduction))
            out.write("minCliffHeight = {0}\n".format(args_tss.height))
            out.write("minCliffHeightDiscount = {0}\n".format(
                args_tss.height_reduction))
            out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
            out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
            out.write("minPlateauLength = 0\n")
            out.write("mode = cond\n")
            out.write("normPercentile = 0.9\n")
            if args_tss.program.lower() == "tss":
                self._print_lib(lib_num, lib_dict["nm"], out,
                                wig_folder, "normalMinus")
                self._print_lib(lib_num, lib_dict["np"], out,
                                wig_folder, "normalPlus")
            else:
                self._print_lib(lib_num, lib_dict["fm"], out,
                                wig_folder, "normalMinus")
                self._print_lib(lib_num, lib_dict["fp"], out,
                                wig_folder, "normalPlus")
            out.write("numReplicates = {0}\n".format(len(rep_set)))
            out.write("numberOfDatasets = {0}\n".format(lib_num))
            out.write("outputDirectory = {0}\n".format(out_path))
            for prefix_id, out_prefix in enumerate(
                    args_tss.output_prefixs, 1):
                out.write("outputPrefix_{0} = {1}\n".format(
                    prefix_id, out_prefix))
            out.write("projectName = {0}\n".format(project_strain_name))
            out.write("superGraphCompatibility = igb\n")
            out.write("texNormPercentile = 0.5\n")
            out.write("writeGraphs = 0\n")
            out.write("writeNocornacFiles = 0\n")

    def _convert_gff(self, prefixs, args_tss):
        """Convert the ``MasterTable.tsv`` of every strain to a GFF file.

        The output file ``<prefix>_<program>.gff`` is always created; it
        stays empty (and an error is printed) when no master table exists.
        """
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            # Create/truncate the output so that it exists even when the
            # conversion below is skipped.
            with open(out_file, "w"):
                pass
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            master_table = os.path.join(out_path, "MasterTable.tsv")
            if not os.path.exists(master_table):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    master_table, "ANNOgesic", args_tss.program,
                    prefix, out_file)

    def _merge_manual(self, tsss, args_tss):
        """Merge the predictions of every strain with the manual annotation
        via ``merge_manual_predict_tss`` and store the comparison statistics
        under ``<out_folder>/statistics/<strain>``."""
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # NOTE(review): when no annotation file matches, ``gff`` keeps
            # the last directory entry - confirm that this is intended.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file,
                        os.path.join(args_tss.out_folder, "statistics",
                                     tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        """Run ``validate_gff`` for every strain.

        The file written by ``validate_gff`` to ``<out_folder>/tmp.gff`` then
        replaces the strain's annotation file in ``args_tss.gffs``.
        """
        print("Running validation of annotation....")
        for tss in tsss:
            # NOTE(review): when no annotation file matches, ``gff`` keeps
            # the last directory entry - confirm that this is intended.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        """Compare the TSS file of every strain with its transcript file.

        For a strain with a ``<strain>_transcript.gff`` file ``stat_ta_tss``
        is run and its two temporary outputs are sorted back over the TSS
        file and over the transcript file in ``args_tss.ta_files``.
        """
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_Transcriptome_assembly_",
                    tss, ".csv"]))
            matched = None
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (len(filename) > 1) and (filename[0] == tss) and (
                        filename[1] == ".gff"):
                    matched = ta
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if matched is not None:
                stat_ta_tss(os.path.join(self.tmps["ta"], matched),
                            compare_file, stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(
                    self.tmps["ta_tss"],
                    os.path.join(args_tss.ta_files, matched))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])

    def _stat_tss(self, tsss, feature):
        """Produce the class/library statistics and the venn plot of every
        strain and collect the results in ``<statistics>/<strain>``."""
        print("Running statistaics.....")
        for tss in tsss:
            stat_folder = os.path.join(self.stat_outfolder, tss)
            compare_file = os.path.join(
                self.gff_outfolder, "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_class", ".png"])
            shared_stat = os.path.join(self.stat_outfolder,
                                       "TSSstatistics.tsv")
            if os.path.exists(shared_stat):
                shutil.move(shared_stat,
                            os.path.join(stat_folder, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        """Write a configuration file for every strain that has a fasta
        file, an annotation file and at least one wig file.

        Returns the list of strain names a configuration was written for.
        """
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] != gff[:-4]:
                    continue
                prefix = fasta[:-3]
                # Evaluated per strain: a wig file of one strain must not
                # make the other strains look as if they had one, too.
                # Entries without "_STRAIN_" are ignored.
                has_wig = False
                for wig in os.listdir(self.wig_path):
                    filename = wig.split("_STRAIN_")
                    if (len(filename) > 1) and (
                            filename[1][:-4] == prefix):
                        has_wig = True
                        break
                if has_wig:
                    prefixs.append(prefix)
                    config = os.path.join(
                        input_folder,
                        "_".join(["config", prefix]) + ".ini")
                    self._gen_config(
                        prefix, args_tss,
                        os.path.join(self.gff_path, gff), self.wig_path,
                        os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Merge the wig files of *libs* whose name contains *prefix* into
        ``tmp/merge_forward.wig`` and ``tmp/merge_reverse.wig``."""
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        targets = {"+": os.path.join("tmp", "merge_forward.wig"),
                   "-": os.path.join("tmp", "merge_reverse.wig")}
        for wig_file in os.listdir(wig_folder):
            wig_path = os.path.join(wig_folder, wig_file)
            if (prefix not in wig_file) or (not os.path.isfile(wig_path)):
                continue
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] in targets):
                    self.helper.merge_file(wig_path, targets[info[-1]])

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        """Run ``check_orphan`` on the prediction of every strain and
        replace the prediction with its result."""
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            gff_name = "_".join([prefix, args_tss.program + ".gff"])
            tmp_tss = os.path.join(self.tmps["tmp"], gff_name)
            pre_tss = os.path.join(self.gff_outfolder, gff_name)
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Remove the temporary folders of the input folders and the merged
        wig files left in the working directory."""
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        for merged in ("merge_forward.wig", "merge_reverse.wig"):
            if os.path.exists(os.path.join(os.getcwd(), merged)):
                os.remove(merged)

    def _deal_with_overlap(self, out_folder, args_tss):
        """Filter positions predicted both as TSS and as processing site.

        Nothing is done when ``args_tss.overlap_feature`` is ``both``.
        Otherwise every prediction in *out_folder* is passed to
        ``filter_tss_pro`` together with the reference file of the other
        feature found in ``args_tss.references``.
        """
        if args_tss.overlap_feature.lower() == "both":
            return
        print("Comparing TSS and Processing site...")
        program = args_tss.program.lower()
        if program == "tss":
            suffix, ref_suffix = "_TSS.gff", "_processing.gff"
        elif program in ("processing", "processing_site"):
            # run_tsspredator renames "processing_site" to "processing"
            # before this method is called, so both spellings are accepted.
            suffix, ref_suffix = "_processing.gff", "_TSS.gff"
        else:
            return
        for tss in os.listdir(out_folder):
            if tss.endswith(suffix):
                ref = self.helper.get_correct_file(
                    args_tss.references, ref_suffix,
                    tss.replace(suffix, ""), None, None)
                filter_tss_pro(os.path.join(out_folder, tss), ref,
                               args_tss.overlap_feature, args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        """Replace every prediction in *gff_folder* by the output of
        ``filter_low_expression`` and record the cutoff it returns in
        ``stat_<strain>_low_expression_cutoff.csv``."""
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        program = args_tss.program.lower()
        for gff in os.listdir(gff_folder):
            prefix = None
            if (program == "tss") and gff.endswith("_TSS.gff"):
                prefix = gff.replace("_TSS.gff", "")
            elif (program == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if not prefix:
                continue
            stat_file = os.path.join(
                self.stat_outfolder, prefix,
                "_".join(["stat", prefix, "low_expression_cutoff.csv"]))
            # One statistics file per strain, closed as soon as it is
            # written (also when no prediction file matched at all).
            with open(stat_file, "w") as out:
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
            os.remove(os.path.join(gff_folder, gff))
            shutil.move("tmp/without_low_expression.gff",
                        os.path.join(gff_folder, gff))

    def run_tsspredator(self, args_tss):
        """Run the whole pipeline: prepare the inputs, run TSSpredator for
        every strain, convert the results and apply the optional steps.

        Side effect: ``args_tss.program`` is rewritten from
        ``processing_site`` to ``processing`` once TSSpredator has run.
        """
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)