class Terminator(object):
    '''detection of terminator

    Drives the terminator pipeline visible in this class: annotation
    files are converted to ptt/rnt and combined, TransTermHP is run on
    them, RNAfold is run on intergenic sequences, candidates are checked
    by detect_coverage, and the results are written below
    ``<out_folder>/gffs`` and ``<out_folder>/tables``.
    '''

    def __init__(self, args_term):
        '''Build every path used by the pipeline and create output folders.

        args_term must provide ``gffs``, ``fastas``, ``trans``,
        ``out_folder`` and ``srnas`` (``srnas`` may be falsy, in which
        case no sRNA folder is used).
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # Each input folder is expected to hold a "tmp" sub-folder
        # (presumably filled by the multiparser - TODO confirm).
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {
            "term": os.path.join(args_term.out_folder, "gffs"),
            "csv": os.path.join(args_term.out_folder, "tables")}
        self.terms = {
            "all": os.path.join(self.outfolder["term"], "all_candidates"),
            "express": os.path.join(self.outfolder["term"],
                                    "expressed_candidates"),
            "best": os.path.join(self.outfolder["term"], "best_candidates"),
            "non": os.path.join(self.outfolder["term"],
                                "non_expressed_candidates")}
        self.csvs = {
            "all": os.path.join(self.outfolder["csv"], "all_candidates"),
            "express": os.path.join(self.outfolder["csv"],
                                    "expressed_candidates"),
            "best": os.path.join(self.outfolder["csv"], "best_candidates"),
            "non": os.path.join(self.outfolder["csv"],
                                "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Scratch locations; several live in the current working directory.
        self.tmps = {
            "transterm": os.path.join(os.getcwd(), "tmp_transterm"),
            "hp": "transtermhp",
            "hp_gff": "transtermhp.gff",
            "hp_path": "tmp_transterm/tmp",
            "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
            "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
            "gff": "tmp.gff",
            "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        '''Concatenate the data rows of several ptt/rnt files.

        For every file only the lines that follow the first line
        containing "Location" are copied to ``combine_file``.  A newline
        is appended when the last line of a file lacks one, so the rows
        of the next file start on a fresh line.  Empty files are skipped
        silently (previously they raised or reused the last line of the
        preceding file).
        '''
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                last_line = None
                with open(file_, 'r') as fh:
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                        last_line = line
                if (last_line is not None) and ("\n" not in last_line):
                    result.write("\n")

    def _make_gff_folder(self):
        '''Create the gff and table output folder of every candidate class.'''
        for kind in ("all", "best", "express", "non"):
            self.helper.check_make_folder(self.terms[kind])
            self.helper.check_make_folder(self.csvs[kind])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        '''Convert every ``.gff`` in ``gff_path`` to ptt/rnt files.

        Returns ``(file_types, prefixs)``: ``file_types`` maps each file
        prefix to "srna" (a matching sRNA gff was found and converted as
        well) or "normal"; ``prefixs`` lists the prefixes in directory
        order.  Exits the program when the matching fasta is missing.
        '''
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if not gff.endswith(".gff"):
                continue
            prefix = gff.split("/")[-1][:-4]
            prefixs.append(prefix)
            gff_file = os.path.join(gff_path, gff)
            rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
            ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
            fasta = self.helper.get_correct_file(
                fasta_path, ".fa", prefix, None, None)
            if not fasta:
                print("Error: {0}.fa can not be found!".format(prefix))
                sys.exit()
            srna = None
            if sRNAs:
                self.multiparser.parser_gff(sRNAs, "sRNA")
                srna = self.helper.get_correct_file(
                    self.srna_path, "_sRNA.gff", prefix, None, None)
            # fasta is always set here (the missing case exited above),
            # so only the presence of an sRNA file decides the type.
            if srna:
                self.converter.convert_gff2rntptt(
                    gff_file, fasta, ptt_file, rnt_file, srna,
                    srna.replace(".gff", ".rnt"))
                file_types[prefix] = "srna"
            else:
                self.converter.convert_gff2rntptt(
                    gff_file, fasta, ptt_file, rnt_file, None, None)
                file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        '''Merge the ptt/rnt (and sRNA rnt) of each prefix into one ptt.'''
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            files = [os.path.join(gff_path, prefix + ".ptt"),
                     os.path.join(gff_path, prefix + ".rnt")]
            if file_type == "srna":
                files.append(
                    os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"])))
            elif file_type != "normal":
                # Unknown types were ignored before; keep ignoring them.
                continue
            self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        '''Run the TransTermHP executable; its stdout goes to ``out``.'''
        t2t_file = os.path.join(out_path, "_".join([
            prefix, "terminators_within_robust_tail-to-tail_regions.t2t"]))
        bag_file = os.path.join(out_path, "_".join([
            prefix, "best_terminator_after_gene.bag"]))
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_),
              "--t2t-perf", t2t_file, "--bag-output", bag_file],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        '''Run TransTermHP for every combined ptt, then drop the ptt folder.

        Output of each genome goes to ``<hp_folder>/<prefix>``.  Exits
        the program when the fasta of a prefix is missing.
        '''
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" not in file_:
                continue
            prefix = file_.replace(".ptt", "")
            fasta = self.helper.get_correct_file(
                self.fasta_path, ".fa", prefix, None, None)
            if not fasta:
                print("Error: {0}.fa can not be found!".format(prefix))
                sys.exit()
            out_path = os.path.join(args_term.hp_folder, prefix)
            self.helper.check_make_folder(out_path)
            report = os.path.join(out_path,
                                  "_".join([prefix, "terminators.txt"]))
            with open(report, "w") as out:
                self._TransTermHP(fasta, file_, out_path, prefix,
                                  out, args_term)
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        '''Convert the ``.bag`` output of TransTermHP to gff files.'''
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix != folder:
                    continue
                out_path = os.path.join(args_term.hp_folder, folder)
                for file_ in os.listdir(out_path):
                    if file_.endswith(".bag"):
                        out_file = os.path.join(
                            self.tmps["transterm"],
                            "_".join([prefix, self.tmps["hp_gff"]]))
                        self.converter.convert_transtermhp2gff(
                            os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        '''Return the folder holding the wiggle files to use.

        When both TEX and fragmented wiggle folders are given, their
        files (sub-folders are skipped) are copied into a ``merge_wigs``
        folder next to the TEX folder.  With only one of them given that
        folder is returned as is; with none the program exits.
        '''
        tex_wigs = args_term.tex_wigs
        frag_wigs = args_term.frag_wigs
        if (tex_wigs is not None) and (frag_wigs is not None):
            parent = "/".join(tex_wigs.split("/")[:-1])
            merge_wigs = os.path.join(parent, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig_folder in (tex_wigs, frag_wigs):
                for wig in os.listdir(wig_folder):
                    wig_file = os.path.join(wig_folder, wig)
                    if not os.path.isdir(wig_file):
                        shutil.copy(wig_file, merge_wigs)
        elif tex_wigs is not None:
            merge_wigs = tex_wigs
        elif frag_wigs is not None:
            merge_wigs = frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information

        When ``sRNAs`` is given, the annotation gff and the sRNA gff of
        every prefix are merged and sorted into the temporary merge
        folder, which is returned; otherwise ``gff_path`` is returned.
        '''
        if sRNAs is None:
            return gff_path
        self.multiparser.parser_gff(sRNAs, "sRNA")
        self.helper.check_make_folder(self.tmps["merge"])
        for prefix in prefixs:
            tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
            # Start from a clean scratch file for every prefix.
            if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                os.remove(tmp_gff)
            self.helper.merge_file(
                os.path.join(gff_path, prefix + ".gff"), tmp_gff)
            self.helper.merge_file(
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])), tmp_gff)
            self.helper.sort_gff(
                tmp_gff, os.path.join(self.tmps["merge"], prefix + ".gff"))
            os.remove(tmp_gff)
        return self.tmps["merge"]

    def _move_file(self, term_outfolder, csv_outfolder):
        '''Sort each ``*_term.gff`` and build its all-candidates gff/table.

        ``csv_outfolder`` is accepted for interface compatibility; the
        table location is taken from ``self.csvs["all"]``.
        '''
        for gff in os.listdir(term_outfolder):
            if not gff.endswith("_term.gff"):
                continue
            term_gff = os.path.join(term_outfolder, gff)
            self.helper.sort_gff(term_gff, self.tmps["gff"])
            shutil.move(self.tmps["gff"], term_gff)
            prefix = gff.replace("_term.gff", "")
            new_gff = os.path.join(
                self.terms["all"],
                "_".join([prefix, self.suffixs["allgff"]]))
            csv_name = "_".join([prefix, self.suffixs["csv"]])
            csv_file = os.path.join(self.csvs["all"], csv_name)
            with open(new_gff, "w") as out:
                out.write("##gff-version 3\n")
            self.helper.merge_file(term_gff, new_gff)
            os.remove(term_gff)
            if csv_name in os.listdir(self.csvs["all"]):
                os.remove(csv_file)
            with open(csv_file, "w") as out_csv:
                out_csv.write("\t".join([
                    "Genome", "Name", "Start", "End", "Strand", "Detect",
                    "Coverage_decrease", "Coverage_detail"]) + "\n")
            # Append the raw table of a genome each time the sequence id
            # changes while walking through the merged gff.
            pre_strain = ""
            with open(new_gff) as fh:
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(
                            os.path.join(
                                self.tmps["term_table"],
                                "_".join([entry.seq_id, "term_raw.csv"])),
                            csv_file)
                    pre_strain = entry.seq_id

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        '''Run RNAfold on ``tmp_seq`` and store its output in ``tmp_sec``.

        The command is executed through the shell from inside a scratch
        folder, which is removed afterwards (presumably so that files
        RNAfold drops into its working directory are discarded - TODO
        confirm).  The previous working directory is restored even if
        the call fails.
        '''
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        try:
            # Paths are given relative to the scratch folder, hence "..".
            os.system(" ".join([RNAfold_path,
                                "<", os.path.join("..", tmp_seq),
                                ">", os.path.join("..", tmp_sec)]))
        finally:
            os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        '''the approach for searching gene converged region terminator

        For every prefix: when a transcript gff exists, intergenic
        sequences are extracted, folded with RNAfold and scanned by
        poly_t; detect_coverage is then run for the prefix.  Finally the
        resulting gff files are combined and moved to the output folders.
        '''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            fasta_file = os.path.join(self.fasta_path, prefix + ".fa")
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                intergenic_seq(fasta_file, tran_file, gff_file,
                               tmp_seq, tmp_index, args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq,
                                  tmp_sec, prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file,
                       tmp_cand, args_term)
            print("Detecting terminators for " + prefix)
            detect_coverage(
                tmp_cand, gff_file, tran_file, fasta_file,
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"],
                             "_".join([prefix, self.tmps["hp_gff"]])),
                merge_wigs,
                os.path.join(self.outfolder["term"],
                             "_".join([prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"],
                             "_".join([prefix, "term_raw.csv"])),
                args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        '''Delete the temporary folders and files created during the run.'''
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        '''Number the terminators of each genome and compute statistics.

        Every ``*_term_all.gff`` is rewritten as ``*_term.gff`` with
        fresh ID/Name attributes, stat_term is run on it, the tables
        stat_term leaves in the gff folders are moved to the table
        folders, and the ``*_term_all.gff`` is deleted.
        '''
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if not gff.endswith("_term_all.gff"):
                continue
            new_prefix = gff.replace("_term_all.gff", "")
            new_prefixs.append(new_prefix)
            with open(self.tmps["gff"], "w") as out_tmp, \
                    open(os.path.join(self.terms["all"], gff)) as fh:
                out_tmp.write("##gff-version 3\n")
                for num, entry in enumerate(self.gff_parser.entries(fh)):
                    entry.attributes["ID"] = (
                        entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "terminator_" + "%05d" % num
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([
                        entry.info_without_attributes,
                        entry.attribute_string]) + "\n")
            shutil.move(
                self.tmps["gff"],
                os.path.join(self.terms["all"],
                             "_".join([new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            csv_name = "_".join([prefix, self.suffixs["csv"]])
            stat_term(
                os.path.join(self.terms["all"],
                             "_".join([prefix, self.suffixs["gff"]])),
                os.path.join(self.csvs["all"], csv_name),
                os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])),
                os.path.join(self.terms["best"], "_".join([prefix, "term"])),
                os.path.join(self.terms["express"],
                             "_".join([prefix, "term"])),
                os.path.join(self.terms["non"], "_".join([prefix, "term"])))
            for kind in ("best", "express", "non"):
                shutil.move(os.path.join(self.terms[kind], csv_name),
                            os.path.join(self.csvs[kind], csv_name))
            os.remove(
                os.path.join(self.terms["all"],
                             "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        '''Run the helper's attribute check on every ``.gff`` in folder.'''
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript

        NOTE: the ``prefixs`` argument is ignored; the prefixes are
        re-derived from the ``*_transcript.gff`` files in the transcript
        folder.
        '''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        print("Comparing terminators with transcripts now")
        prefixs = [file_.replace("_transcript.gff", "")
                   for file_ in os.listdir(self.tran_path)
                   if file_.endswith("_transcript.gff")]
        stat_folder = os.path.join(args_term.out_folder, "statistics")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(
                self.tran_path,
                os.path.join(self.outfolder["term"], type_),
                args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                args_term.out_folder, "terminator",
                self.outfolder["term"], args_term.trans)
            # Rename the per-genome statistics so that the candidate
            # class becomes part of the file name.
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        stat_folder,
                        "stat_compare_transcript_terminator_" +
                        prefix + ".csv"),
                    os.path.join(
                        stat_folder,
                        "_".join(["stat_compare_terminator_transcript",
                                  prefix, type_ + ".csv"])))

    def run_terminator(self, args_term):
        '''Run the complete terminator detection.

        Exits the program with an error message when no gff or fasta
        folder is assigned.  This is checked before any of the inputs
        are touched.
        '''
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class TestConverter(unittest.TestCase):
    '''Unit tests for the file-format conversions of Converter.

    The converter's gff parser, TSSpredator reader, title printer and
    file reader are replaced by mocks in setUp; every test writes its
    input below a scratch folder that tearDown removes.
    '''

    def setUp(self):
        self.converter = Converter()
        self.example = Example()
        # Swap the collaborators of the converter for mocks.
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        # Fixture texts and expected results.
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)

    def tearDown(self):
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    @staticmethod
    def get_info(datas):
        '''Return the first eight tab-separated columns of each data line.

        Lines starting with "#" are dropped.
        '''
        return ["\t".join(data.split("\t")[:8])
                for data in datas if not data.startswith("#")]

    def test_print_rntptt_file(self):
        cdss = []
        genes = []
        rnas = []
        targets = {"gene": genes, "CDS": cdss, "tRNA": rnas}
        for gff in Example().gff_dict:
            if gff["feature"] in targets:
                targets[gff["feature"]].append(
                    self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index],
                                 self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [],
                        "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(
                tss, tss_index, tss_features, nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic",
                                               "TSS", "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(
            gff_file, fasta_file, ptt_file, rnt_file,
            srna_input_file, srna_output_file)
        # Check that the outputs were really written; asserting on the
        # path strings themselves would pass unconditionally.
        self.assertTrue(os.path.exists(srna_output_file))
        self.assertTrue(os.path.exists(rnt_file))
        self.assertTrue(os.path.exists(ptt_file))

    def test_convert_embl2gff(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    def test_convert_circ2gff(self):
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all,
                                        out_filter)
        # Only the first eight gff columns are compared.
        self.assertListEqual(
            self.get_info(import_data(out_all)),
            self.get_info(self.circ_all.split("\n")))
        self.assertListEqual(
            self.get_info(import_data(out_filter)),
            self.get_info(self.circ_best.split("\n")))
class Terminator(object): def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables")} self.terms = {"all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "express"), "best": os.path.join(self.outfolder["term"], "best"), "non": os.path.join(self.outfolder["term"], "non_express")} self.csvs = {"all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "express"), "best": os.path.join(self.outfolder["csv"], "best"), "non": os.path.join(self.outfolder["csv"], "non_express")} self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp")} self.suffixs = {"gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff"} if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: check_start = False fh = open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) 
self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file( fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: no proper file - {0}.fa".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" if (not srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" else: self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" return file_types, prefixs def _combine_ptt_rnt(self, gff_path, file_types, srna_path): self.helper.check_make_folder(self.combine_path) for prefix, file_type in file_types.items(): combine_file = os.path.join(self.combine_path, prefix + '.ptt') if file_type == "normal": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt")] self._combine_annotation(combine_file, files) elif file_type == "srna": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt"), os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))] 
self._combine_annotation(combine_file, files) def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term): call([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))], stdout=out) def _run_TransTermHP(self, args_term): self.helper.check_make_folder(self.tmps["transterm"]) for file_ in os.listdir(self.combine_path): if ".ptt" in file_: prefix = file_.replace(".ptt", "") fasta = self.helper.get_correct_file( self.fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: no proper file - {0}.fa".format(prefix)) sys.exit() out_path = os.path.join(args_term.hp_folder, prefix) self.helper.check_make_folder(out_path) out = open(os.path.join(out_path, "_".join([prefix, "terminators.txt"])), "w") self._TransTermHP(fasta, file_, out_path, prefix, out, args_term) out.close() shutil.rmtree(self.combine_path) def _convert_to_gff(self, prefixs, args_term): for prefix in prefixs: for folder in os.listdir(args_term.hp_folder): if prefix == folder: out_path = os.path.join(args_term.hp_folder, folder) for file_ in os.listdir(out_path): if file_.endswith(".bag"): out_file = os.path.join( self.tmps["transterm"], "_".join([prefix, self.tmps["hp_gff"]])) self.converter.convert_transtermhp2gff( os.path.join(out_path, file_), out_file) self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"], None, self.tmps["hp"]) def _combine_wigs(self, args_term): if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): folder = args_term.tex_wigs.split("/") folder = "/".join(folder[:-1]) merge_wigs = os.path.join(folder, "merge_wigs") self.helper.check_make_folder(merge_wigs) for wig in os.listdir(args_term.tex_wigs): if os.path.isdir(os.path.join(args_term.tex_wigs, wig)): pass else: 
shutil.copy(os.path.join(args_term.tex_wigs, wig), merge_wigs) for wig in os.listdir(args_term.frag_wigs): if os.path.isdir(os.path.join(args_term.frag_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.frag_wigs, wig), merge_wigs) elif (args_term.tex_wigs is not None): merge_wigs = args_term.tex_wigs elif (args_term.frag_wigs is not None): merge_wigs = args_term.frag_wigs else: print("Error: no proper wig files!!!") sys.exit() return merge_wigs def _merge_sRNA(self, sRNAs, prefixs, gff_path): if sRNAs is not None: self.multiparser.parser_gff(sRNAs, "sRNA") self.helper.check_make_folder(self.tmps["merge"]) for prefix in prefixs: tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"]) if self.tmps["gff"] in os.listdir(self.tmps["merge"]): os.remove(tmp_gff) self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"), tmp_gff) self.helper.merge_file(os.path.join( self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff) self.helper.sort_gff(tmp_gff, os.path.join( self.tmps["merge"], prefix + ".gff")) os.remove(tmp_gff) merge_path = self.tmps["merge"] else: merge_path = gff_path return merge_path def _move_file(self, term_outfolder, csv_outfolder): for gff in os.listdir(term_outfolder): if gff.endswith("_term.gff"): self.helper.sort_gff(os.path.join(term_outfolder, gff), self.tmps["gff"]) shutil.move(self.tmps["gff"], os.path.join(term_outfolder, gff)) prefix = gff.replace("_term.gff", "") new_gff = os.path.join(self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]])) csv_file = os.path.join( os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) out = open(new_gff, "w") out.write("##gff-version 3\n") out.close() self.helper.merge_file( os.path.join(term_outfolder, gff), os.path.join( self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]]))) os.remove(os.path.join(term_outfolder, gff)) pre_strain = "" if ("_".join([prefix, self.suffixs["csv"]]) in os.listdir(self.csvs["all"])): os.remove(csv_file) out_csv = 
open(csv_file, "w") out_csv.write("\t".join(["strain", "name", "start", "end", "strand", "detect", "coverage_detail"]) + "\n") out_csv.close() fh = open(new_gff) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: self.helper.merge_file(os.path.join( self.tmps["term_table"], "_".join([ entry.seq_id, "term_raw.csv"])), os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) pre_strain = entry.seq_id fh.close() def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix): print("Computing secondray structure of {0}".format(prefix)) self.helper.check_make_folder(self.tmps["folder"]) pre_cwd = os.getcwd() os.chdir(self.tmps["folder"]) os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)])) os.chdir(pre_cwd) shutil.rmtree(self.tmps["folder"]) def _compute_intersection_forward_reverse( self, prefixs, merge_path, wig_path, merge_wigs, args_term): for prefix in prefixs: tmp_seq = os.path.join(args_term.out_folder, "_".join(["inter_seq", prefix])) tmp_sec = os.path.join(args_term.out_folder, "_".join(["inter_sec", prefix])) tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) gff_file = os.path.join(merge_path, prefix + ".gff") print("Extracting seq of {0}".format(prefix)) intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"), tran_file, gff_file, tmp_seq) self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix) tmp_cand = os.path.join(args_term.out_folder, "_".join(["term_candidates", prefix])) poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term) print("detection of terminator") detect_coverage( tmp_cand, os.path.join(merge_path, prefix + ".gff"), os.path.join(self.tran_path, "_".join([ prefix, "transcript.gff"])), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(wig_path, "_".join([prefix, "forward.wig"])), os.path.join(wig_path, "_".join([prefix, "reverse.wig"])), os.path.join(self.tmps["hp_path"], "_".join([ 
prefix, self.tmps["hp_gff"]])), merge_wigs, os.path.join(self.outfolder["term"], "_".join([ prefix, self.suffixs["gff"]])), os.path.join(self.tmps["term_table"], "_".join([ prefix, "term_raw.csv"])), args_term) self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"], None, "term") self._move_file(self.outfolder["term"], self.outfolder["csv"]) def _remove_tmp_file(self, merge_wigs, args_term): self.helper.remove_tmp(args_term.gffs) self.helper.remove_tmp(args_term.fastas) if args_term.srnas is not None: self.helper.remove_tmp(args_term.srnas) shutil.rmtree(self.tmps["merge"]) if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): shutil.rmtree(merge_wigs) self.helper.remove_tmp(args_term.trans) self.helper.remove_tmp(args_term.tex_wigs) self.helper.remove_tmp(args_term.frag_wigs) self.helper.remove_tmp(self.outfolder["term"]) shutil.rmtree(self.tmps["transterm"]) shutil.rmtree(self.tmps["term_table"]) self.helper.remove_all_content(args_term.out_folder, "inter_seq_", "file") self.helper.remove_all_content(args_term.out_folder, "inter_sec_", "file") self.helper.remove_all_content(args_term.out_folder, "term_candidates_", "file") def _compute_stat(self, args_term): new_prefixs = [] for gff in os.listdir(self.terms["all"]): if gff.endswith("_term_all.gff"): out_tmp = open(self.tmps["gff"], "w") out_tmp.write("##gff-version 3\n") new_prefix = gff.replace("_term_all.gff", "") new_prefixs.append(gff.replace("_term_all.gff", "")) num = 0 fh = open(os.path.join(self.terms["all"], gff)) for entry in self.gff_parser.entries(fh): name = '%0*d' % (5, num) entry.attributes["ID"] = "term" + str(num) entry.attributes["Name"] = "_".join(["Terminator_" + name]) entry.attribute_string = ";".join([ "=".join(items) for items in entry.attributes.items()]) out_tmp.write("\t".join([entry.info_without_attributes, entry.attribute_string]) + "\n") num += 1 out_tmp.close() fh.close() shutil.move(self.tmps["gff"], os.path.join(self.terms["all"], 
"_".join([new_prefix, self.suffixs["gff"]]))) if args_term.stat: stat_path = os.path.join(args_term.out_folder, "statistics") for prefix in new_prefixs: stat_term(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])), os.path.join(self.terms["best"], "_".join([prefix, "term"])), os.path.join(self.terms["express"], "_".join([prefix, "term"])), os.path.join(self.terms["non"], "_".join([prefix, "term"]))) shutil.move(os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]]))) os.remove(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) def _check_gff_file(self, folder): for file_ in os.listdir(folder): if file_.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, file_)) def _compare_term_tran(self, args_term): self.multiparser.combine_gff(args_term.gffs, self.tran_path, None, "transcript") for type_ in ("best", "express", "all_candidates"): compare_term_tran(self.tran_path, os.path.join(self.outfolder["term"], type_), args_term.fuzzy_up_ta, args_term.fuzzy_down_ta, args_term.out_folder, "terminator") shutil.move( os.path.join( args_term.out_folder, "statistics", "stat_comparison_terminator_transcript.csv"), os.path.join( args_term.out_folder, "statistics", "stat_comparison_terminator_transcript_" + type_ + ".csv")) def run_terminator(self, args_term): self._check_gff_file(args_term.gffs) self._check_gff_file(args_term.trans) 
self.multiparser.parser_fasta(args_term.fastas) if (not args_term.gffs) or (not args_term.fastas): print("Error: please assign gff annotation folder " "and fasta folder!!!") sys.exit() file_types, prefixs = self._convert_gff2rntptt( self.gff_path, self.fasta_path, args_term.srnas) self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path) self._run_TransTermHP(args_term) self._convert_to_gff(prefixs, args_term) self.helper.remove_tmp(self.gff_path) self.multiparser.parser_gff(args_term.trans, "transcript") self.helper.check_make_folder(self.tmps["term_table"]) self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"]) merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path) self._compute_intersection_forward_reverse( prefixs, merge_path, args_term.wig_path, args_term.merge_wigs, args_term) self._compute_stat(args_term) self._compare_term_tran(args_term) self._remove_tmp_file(args_term.merge_wigs, args_term)
class RATT(object):
    '''annotation transfer

    Drives an external RATT executable for every ``ref:target`` pair and,
    when requested, converts the resulting EMBL files to gff/ptt/rnt.
    '''

    def __init__(self, args_ratt):
        '''Set up helpers and the working paths derived from args_ratt.'''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            # An explicit EMBL folder overrides the one derived from ref_gbk.
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    @staticmethod
    def _first_field(line, keyword):
        '''Return the first whitespace-separated field of line that is not
        keyword, or None when there is no such field.'''
        for field in line.split():
            if field != keyword:
                return field
        return None

    def _convert_to_pttrnt(self, gffs, files):
        '''Write a .ptt and .rnt file next to every .gff listed in files.

        gffs is the folder that holds the gff files; a gff is skipped when
        no matching target fasta is found.
        '''
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(
                    self.tmp_files["tar"], ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(
                        gff, fasta, ptt, rnt, None, None)

    def _remove_files(self, args_ratt, out_gbk):
        '''Replace the per-sequence outputs by the merged ones and remove
        the temporary folders. out_gbk is accepted but not used here.'''
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(
                args_ratt.gff_outfolder, suffix, "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        '''Convert one RATT EMBL result to gff, fix its format, copy it to
        the gff output folder and append the gff file name to files.'''
        name = ratt_result.split(".")
        # The sequence name is everything between the first field and the
        # last two dot-separated fields of the result file name.
        seq_name = ".".join(name[1:-2])
        filename = seq_name + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, seq_name, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file,
                    os.path.join(args_ratt.gff_outfolder, filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split the given GenBank files into one file per record.

        Each record (LOCUS ... //) is written to self.gbk_tmp and then moved
        into self.gbk under a name taken from its LOCUS / VERSION lines.
        Lines outside a record (before the first LOCUS or after a closing
        "//") are ignored instead of being written to a closed or missing
        handle. Returns self.gbk.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            out = None
            filename = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if line.startswith("LOCUS"):
                        if out is not None:
                            # Previous record had no "//" terminator.
                            out.close()
                        out = open(self.gbk_tmp, "w")
                        locus = self._first_field(line, "LOCUS")
                        if locus is not None:
                            filename = ".".join([locus, "gbk"])
                    elif line.startswith("VERSION"):
                        version = self._first_field(line, "VERSION")
                        if version is not None:
                            new_filename = ".".join([version, "gbk"])
                            # str.find is 0 (falsy) only when the
                            # LOCUS-based name is a prefix of the
                            # VERSION-based one; otherwise VERSION wins.
                            if (filename is None) or (
                                    new_filename.find(filename)):
                                filename = new_filename
                    if out is not None:
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
            if out is not None:
                # Unterminated last record: just release the handle.
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''convert gbk to embl

        Collects the .gbk/.gbff/.gb files of ref_embls, splits them per
        record, converts them and moves the .embl files to self.embl.
        Exits when no GenBank file is found. Returns the record folder.
        '''
        gbks = [os.path.join(ref_embls, embl)
                for embl in os.listdir(ref_embls)
                if embl.endswith((".gbk", ".gbff", ".gb"))]
        if not gbks:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        '''Call the RATT executable for one target/reference pair, sending
        its stdout to the open file out and discarding stderr.'''
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ], stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        '''Run RATT for every "ref:target" pair and tidy the working dir.

        The log file is opened once for the whole run, so the output of
        every pair is kept and the handle is always closed, even when
        there are no pairs. After each run, files in the current directory
        whose name contains "final" are moved to the output path and other
        RATT leftovers are deleted.
        '''
        print("Running RATT")
        leftovers = (args_ratt.element, "query", "Reference",
                     "Query", "Sequences")
        with open(self.ratt_log, "w") as out:
            for pair in args_ratt.pairs:
                names = pair.split(":")
                ref = names[0]
                tar = names[1]
                self._run_ratt(args_ratt, tar, ref, out)
                for filename in os.listdir():
                    if "final" in filename:
                        shutil.move(filename, os.path.join(
                            args_ratt.output_path, filename))
                    elif any(tag in filename for tag in leftovers):
                        if os.path.isfile(filename):
                            os.remove(filename)
                        if os.path.isdir(filename):
                            shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt):
        '''Entry point: prepare the inputs, run RATT and, if
        args_ratt.convert is set, convert and merge the results per genome
        before the temporary files are removed.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = ".".join(datas[0].split(".")[:-1])
                    for file_ in os.listdir(
                            os.path.join(args_ratt.tar_fastas, folder)):
                        # Drop the ".fa" extension.
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            for ext in ("gff", "ptt", "rnt"):
                                if ("." + ext in gff) and (
                                        file_ == gff[:-4]):
                                    self.helper.merge_file(
                                        os.path.join(
                                            args_ratt.gff_outfolder, gff),
                                        self.tmp_files[ext])
                    if os.path.exists(self.tmp_files["gff"]):
                        for ext in ("gff", "ptt", "rnt"):
                            shutil.move(
                                self.tmp_files[ext],
                                os.path.join(self.tmp_files["out_gff"],
                                             prefix + "." + ext))
                    else:
                        print("Error: Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).")
        self._remove_files(args_ratt, out_gbk)
class PPINetwork(object):
    '''detection of PPI

    Resolves locus tags to STRING identifiers, downloads their STRING
    actions and the matching PIE/PubMed scores with wget, writes the merged
    tables and plots the networks.
    '''

    def __init__(self, out_folder):
        '''Set up helpers and the output / temporary paths in out_folder.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        self.tmp_files = {
            "log": "tmp_log",
            "action": "tmp_action.log",
            "pubmed": "tmp_pubmed.log",
            "specific": os.path.join(out_folder, "tmp_specific"),
            "nospecific": os.path.join(out_folder, "tmp_nospecific"),
            "wget_action": os.path.join(out_folder, "tmp_action")
        }

    def _make_folder_no_exist(self, path, folder):
        '''Create path/folder unless path already lists folder.'''
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        '''Create path/strain and path/strain/ptt.'''
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        '''Download source to the file folder with wget (stderr goes to the
        open file log), then pause for one second.'''
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(1)

    def _wget_id(self, strain, locus, strain_id, files):
        '''Download the STRING id table of locus when strain equals
        strain_id["ptt"]. Returns True if a download was attempted.'''
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                             locus, strain_id["string"])
            self._run_wget(id_source,
                           os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        '''Fetch the STRING id of every gene; report genes whose strain
        does not match strain_id.'''
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        '''Return the fourth column of the id-table row whose first column
        equals the STRING id row_a, or "" when it cannot be found.

        The id table is looked up under the second dot-separated field of
        row_a and is downloaded first when missing, unless "all" is in
        querys.
        '''
        prefername = ""
        locus = row_a.split(".")[1]
        if (locus not in os.listdir(files["id_list"])) and (
                "all" not in querys):
            self._wget_id(strain_id["ptt"], locus, strain_id, files)
        if locus in os.listdir(files["id_list"]):
            with open(os.path.join(files["id_list"], locus), "r") as id_h:
                for row_i in csv.reader(id_h, delimiter="\t"):
                    if row_a == row_i[0]:
                        prefername = row_i[3]
        return prefername

    def _print_title(self, out, id_file, id_folder):
        '''Write the two header lines of an interaction table to out. The
        displayed name is the fourth column of the last row of the id
        table, or id_file itself when the table is empty.'''
        prefername = id_file
        with open(os.path.join(id_folder, id_file), "r") as id_h:
            for row_i in csv.reader(id_h, delimiter="\t"):
                prefername = row_i[3]
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output,
                    ptt, files, paths, args_ppi):
        '''Download the PIE results of one interaction and merge them.

        Nothing happens unless both partners resolve to a name. row is
        updated in place with the names and the combined mode / actor
        strings before it is written out.
        '''
        prefer1 = self._get_prefer_name(row[0], strain_id,
                                        files, args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id,
                                        files, args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                    prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"], row,
                    args_ppi.score, id_file, files["id_list"], "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        '''Write one tab-separated line: ptt, the fields of row_a and the
        fields of row (two "NA" columns when row is the string "NA").'''
        if row == "NA":
            out_single.write("\t".join(
                [ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write("\t".join(
                [ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        '''Combine the interaction row_a with the downloaded table filename.

        Every row of the table goes to the per-pair "all" file and to
        out_all; rows whose second column is >= score also go to the
        per-pair "best" file and to out_best. An empty table yields a
        single "NA" line in the "all" outputs. first_output tracks whether
        the title of each combined output still has to be written. The
        per-pair "best" file is deleted when no row passed the cutoff.
        '''
        single_name = "_".join([row_a[0], row_a[1] + ".csv"])
        all_single = os.path.join(all_folder, ptt, single_name)
        all_key = "_".join([file_type, "all"])
        best_key = "_".join([file_type, "best"])
        if os.path.getsize(filename) != 0:
            best_single = os.path.join(best_folder, ptt, single_name)
            detect = False
            with open(filename, "r") as f_h:
                with open(all_single, "w") as out_all_single:
                    with open(best_single, "w") as out_best_single:
                        self._print_title(out_all_single, id_file, id_folder)
                        self._print_title(out_best_single, id_file,
                                          id_folder)
                        for row in csv.reader(f_h, delimiter="\t"):
                            self._print_single_file(out_all_single, row_a,
                                                    ptt, row)
                            if first_output[all_key]:
                                first_output[all_key] = False
                                self._print_title(out_all, id_file,
                                                  id_folder)
                            out_all.write("\t".join(
                                [ptt, "\t".join(row_a),
                                 "\t".join(row)]) + "\n")
                            if float(row[1]) >= score:
                                detect = True
                                self._print_single_file(out_best_single,
                                                        row_a, ptt, row)
                                if first_output[best_key]:
                                    first_output[best_key] = False
                                    self._print_title(out_best, id_file,
                                                      id_folder)
                                out_best.write("\t".join(
                                    [ptt, "\t".join(row_a),
                                     "\t".join(row)]) + "\n")
            # Remove only after the handle is closed; deleting an open file
            # fails on platforms that lock open files.
            if not detect:
                os.remove(best_single)
        else:
            with open(all_single, "w") as out_all_single:
                self._print_title(out_all_single, id_file, id_folder)
                self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output[all_key]:
                first_output[all_key] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join(
                [ptt, "\t".join(row_a), "NA", "NA"]) + "\n")

    def _detect_protein(self, strain_id, args_ppi):
        '''Read the ptt file of strain_id and return the queried genes as
        dicts with the keys "strain" and "locus_tag".

        With "all" in args_ppi.querys every data row is returned; otherwise
        only rows matching a "strain:start:end:strand" query.
        '''
        genes = []
        # Stays None until a single-column "<name> - <start>..<end>" line
        # has been read.
        name = None
        with open(os.path.join(args_ppi.ptts, strain_id["file"]), "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                    name = (row[0].split("-"))[0].strip().split(
                        ",")[0].strip()
                is_data = (len(row) > 1) and (row[0] != "Location")
                if "all" in args_ppi.querys:
                    if is_data:
                        genes.append({"strain": name, "locus_tag": row[5]})
                else:
                    for query in args_ppi.querys:
                        datas = query.split(":")
                        strain = datas[0]
                        start = datas[1]
                        end = datas[2]
                        strand = datas[3]
                        if is_data and (name == strain) and (
                                start == row[0].split("..")[0]) and (
                                end == row[0].split("..")[1]) and (
                                strand == row[1]):
                            genes.append({"strain": name,
                                          "locus_tag": row[5]})
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        '''Create the "without_strain" folders and open the two combined
        output tables (stored in files, closed by _plot).'''
        for key in ("all", "best", "fig"):
            self._make_subfolder(
                paths[key], self.without_strain, strain_id["ptt"])
        filename_nostrain = "_".join([strain_id["file"].replace(".ptt", ""),
                                      self.without_strain + ".csv"])
        files["all_nospecific"] = open(
            os.path.join(paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(
            os.path.join(paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        '''Prepare the folders and output files of strain_id and return its
        queried genes.

        Returns an empty list when no gene list was read (the ptt file is
        missing from args_ppi.ptts, or the file equals pre_file). Exits
        when the file name does not end with ".ptt".
        '''
        genes = []
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder(
                    "_".join([self.tmp_id, strain_id["file"]]))
                base = strain_id["file"][:-4]
                paths["all"] = os.path.join(self.all_result, base)
                paths["best"] = os.path.join(self.best_result, base)
                paths["fig"] = os.path.join(self.fig, base)
                for key in ("all", "best", "fig"):
                    self.helper.check_make_folder(paths[key])
                for key in ("all", "best", "fig"):
                    self._make_subfolder(
                        paths[key], self.with_strain, strain_id["ptt"])
                filename_strain = "_".join(
                    [strain_id["file"].replace(".ptt", ""),
                     self.with_strain + ".csv"])
                files["all_specific"] = open(os.path.join(
                    paths["all"], filename_strain), "w")
                files["best_specific"] = open(os.path.join(
                    paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(os.path.join(
                    files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["pubmed"]), "w")
                # NOTE(review): this rebinding is local only; the caller's
                # pre_file is not updated by it.
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                self._make_folder_no_exist(
                    os.path.join(paths["all"], self.with_strain),
                    strain_id["ptt"])
                self._make_folder_no_exist(
                    os.path.join(paths["best"], self.with_strain),
                    strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        '''Download the STRING actions of the first id-table row whose
        species column equals strain_id["string"].

        Returns True when the id table holds at least one non-header row.
        out_folder is accepted but not used here.
        '''
        detect = False
        with open(os.path.join(files["id_list"], id_file), "r") as t_h:
            print("Retrieving STRING actions for {0} of {1} -- {2}".format(
                id_file, strain_id["string"], strain_id["file"]))
            for row in csv.reader(t_h, delimiter="\t"):
                if row[0].startswith("stringId"):
                    continue
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = (
                        "http://string-db.org/api/tsv/actions?"
                        "identifier={0}&species={1}").format(
                            row[0], row[1])
                    self._run_wget(
                        action_source, self.tmp_files["wget_action"],
                        files["action_log"])
                    break
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''get the interaction of proteins

        For every id table, downloads the actions and groups consecutive
        rows that share both partners: their modes and actors are joined
        with ";" and each group is passed to _get_pubmed once.
        '''
        for id_file in os.listdir(files["id_list"]):
            if id_file == self.tmp_files["log"]:
                continue
            if not self._wget_actions(files, id_file, strain_id,
                                      args_ppi.out_folder):
                continue
            pre_row = []
            first = True
            detect = False
            first_output = {"specific_all": True,
                            "specific_best": True,
                            "nospecific_all": True,
                            "nospecific_best": True}
            with open(self.tmp_files["wget_action"], "r") as a_h:
                print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                    id_file, strain_id["string"], strain_id["file"]))
                for row_a in csv.reader(a_h, delimiter="\t"):
                    if row_a == []:
                        print("No interaction can be detected...")
                        break
                    if row_a[0].startswith("item_id_a"):
                        continue
                    detect = True
                    if first:
                        first = False
                        mode = row_a[2]
                        actor = row_a[4]
                    elif (row_a[0] != pre_row[0]) or (
                            row_a[1] != pre_row[1]):
                        # New pair: flush the finished group first.
                        self._get_pubmed(
                            pre_row, strain_id, mode, actor, id_file,
                            first_output, strain_id["ptt"], files,
                            paths, args_ppi)
                        mode = row_a[2]
                        actor = row_a[4]
                    else:
                        mode = mode + ";" + row_a[2]
                        actor = actor + ";" + row_a[4]
                    pre_row = row_a
            if detect:
                # Flush the last group. pre_row is the last data row, while
                # the loop variable may be an empty row after the break.
                self._get_pubmed(
                    pre_row, strain_id, mode, actor, id_file,
                    first_output, strain_id["ptt"], files,
                    paths, args_ppi)

    def _plot(self, args_ppi, files):
        '''Close the combined output tables and plot every result folder
        that also has a figure folder.'''
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(os.path.join(self.all_result, folder,
                                      "_".join([folder,
                                                self.with_strain + ".csv"])),
                         args_ppi.score,
                         os.path.join(self.fig, folder, self.with_strain),
                         args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(os.path.join(
                        self.all_result, folder,
                        "_".join([folder, self.without_strain + ".csv"])),
                        args_ppi.score,
                        os.path.join(self.fig, folder, self.without_strain),
                        args_ppi.size)

    def _remove_tmps(self, args_ppi):
        '''Remove the "tmp" files and folders of the output folder and the
        generated "PPI_" files of the ptt folder.'''
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''Entry point: convert the annotations of every
        "file:ptt_strain:string_id:pie_name" entry of args_ppi.strains,
        retrieve ids, actions and PubMed scores, then plot and clean up.'''
        strain_ids = []
        paths = {}
        files = {}
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            # Translate the given species (matched against columns 0, 2
            # and 3 of the species table) to the value of column 0.
            with open(args_ppi.species, "r") as s_h:
                for row in csv.reader(s_h, delimiter="\t"):
                    if row[0] != "##":
                        if row[0] == strain_id["string"]:
                            break
                        elif row[2] == strain_id["string"]:
                            strain_id["string"] = row[0]
                            break
                        elif row[3] == strain_id["string"]:
                            strain_id["string"] = row[0]
                            break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
class PPINetwork(object): '''detection of PPI''' def __init__(self, out_folder): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gffparser = Gff3Parser() self.tmp_id = os.path.join(out_folder, "tmp_id_list") self.all_result = os.path.join(out_folder, "all_results") self.best_result = os.path.join(out_folder, "best_results") self.fig = os.path.join(out_folder, "figures") self.with_strain = "with_strain" self.without_strain = "without_strain" self.tmp_files = { "log": "tmp_log", "action": "tmp_action.log", "pubmed": "tmp_pubmed.log", "specific": os.path.join(out_folder, "tmp_specific"), "nospecific": os.path.join(out_folder, "tmp_nospecific"), "wget_action": os.path.join(out_folder, "tmp_action") } def _make_folder_no_exist(self, path, folder): if folder not in os.listdir(path): os.mkdir(os.path.join(path, folder)) def _make_subfolder(self, path, strain, ptt): os.mkdir(os.path.join(path, strain)) os.mkdir(os.path.join(path, strain, ptt)) def _run_wget(self, source, folder, log): call(["wget", source, "-O", folder], stderr=log) time.sleep(2) def _wget_id(self, strain, locus, strain_id, files): detect_id = False if strain == strain_id["ptt"]: print("Retrieving STRING ID for {0} of {1} -- {2}".format( locus, strain_id["string"], strain_id["file"])) id_source = ("http://string-db.org/api/tsv/resolve?" 
"identifier={0}&species={1}").format( locus, strain_id["string"]) self._run_wget(id_source, os.path.join(files["id_list"], locus), files["id_log"]) detect_id = True return detect_id def _retrieve_id(self, strain_id, genes, files): for gene in genes: detect_id = self._wget_id(gene["strain"], gene["locus_tag"], strain_id, files) if not detect_id: print("Error:there is no {0} in {1}".format( gene, strain_id["file"])) def _get_prefer_name(self, row_a, strain_id, files, querys): prefername = "" filename = row_a.split(".") if (filename[1] not in os.listdir( files["id_list"])) and ("all" not in querys): self._wget_id(strain_id["ptt"], filename[1], strain_id, files) if filename[1] in os.listdir(files["id_list"]): id_h = open(os.path.join(files["id_list"], filename[1]), "r") for row_i in csv.reader(id_h, delimiter="\t"): if row_a == row_i[0]: prefername = row_i[3] id_h.close() return prefername def _print_title(self, out, id_file, id_folder): id_h = open(os.path.join(id_folder, id_file), "r") prefername = id_file for row_i in csv.reader(id_h, delimiter="\t"): prefername = row_i[3] id_h.close() out.write("Interaction of {0} | {1}\n".format(id_file, prefername)) out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t" "STRING_action_score\tpubmed_id\tpubmed_score\n") def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output, ptt, files, paths, args_ppi): prefer1 = self._get_prefer_name(row[0], strain_id, files, args_ppi.querys) prefer2 = self._get_prefer_name(row[1], strain_id, files, args_ppi.querys) if (len(prefer1) > 0) and (len(prefer2) > 0): if args_ppi.no_specific: pubmed_source = ( "http://www.ncbi.nlm.nih.gov/CBBresearch/" "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format( prefer1, prefer2) self._run_wget(pubmed_source, self.tmp_files["nospecific"], files["pubmed_log"]) strain_id["pie"] = "+".join(strain_id["pie"].split(" ")) pubmed_source = ("http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur" "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format( 
prefer1, prefer2, strain_id["pie"]) self._run_wget(pubmed_source, self.tmp_files["specific"], files["pubmed_log"]) row[2] = mode row[4] = actor row[0] = prefer1 row[1] = prefer2 self._merge_information( first_output, self.tmp_files["specific"], files["all_specific"], files["best_specific"], row, args_ppi.score, id_file, files["id_list"], "specific", os.path.join(paths["all"], self.with_strain), os.path.join(paths["best"], self.with_strain), ptt) if args_ppi.no_specific: self._merge_information( first_output, self.tmp_files["nospecific"], files["all_nospecific"], files["best_nospecific"], row, args_ppi.score, id_file, files["id_list"], "nospecific", os.path.join(paths["all"], self.without_strain), os.path.join(paths["best"], self.without_strain), ptt) def _print_single_file(self, out_single, row_a, ptt, row): if row == "NA": out_single.write("\t".join([ptt, "\t".join(row_a), "NA", "NA"]) + "\n") else: out_single.write( "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n") def _merge_information(self, first_output, filename, out_all, out_best, row_a, score, id_file, id_folder, file_type, all_folder, best_folder, ptt): if os.path.getsize(filename) != 0: f_h = open(filename, "r") out_all_single = open( os.path.join(all_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w") out_best_single = open( os.path.join(best_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w") self._print_title(out_all_single, id_file, id_folder) self._print_title(out_best_single, id_file, id_folder) detect = False for row in csv.reader(f_h, delimiter="\t"): self._print_single_file(out_all_single, row_a, ptt, row) if first_output["_".join([file_type, "all"])]: first_output["_".join([file_type, "all"])] = False self._print_title(out_all, id_file, id_folder) out_all.write( "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n") if (float(row[1]) >= score): detect = True self._print_single_file(out_best_single, row_a, ptt, row) if first_output["_".join([file_type, "best"])]: 
first_output["_".join([file_type, "best"])] = False self._print_title(out_best, id_file, id_folder) out_best.write( "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n") f_h.close() if not detect: os.remove( os.path.join(best_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"]))) out_all_single.close() out_best_single.close() else: out_all_single = open( os.path.join(all_folder, ptt, "_".join([row_a[0], row_a[1] + ".csv"])), "w") self._print_title(out_all_single, id_file, id_folder) self._print_single_file(out_all_single, row_a, ptt, "NA") if first_output["_".join([file_type, "all"])]: first_output["_".join([file_type, "all"])] = False self._print_title(out_all, id_file, id_folder) out_all.write("\t".join([ptt, "\t".join(row_a), "NA", "NA"]) + "\n") out_all_single.close() def _detect_protein(self, strain_id, args_ppi): fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r") genes = [] for row in csv.reader(fh, delimiter="\t"): if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]): name = (row[0].split("-"))[0].strip().split(",")[0].strip() if ("all" in args_ppi.querys): if (len(row) > 1) and (row[0] != "Location"): genes.append({"strain": name, "locus_tag": row[5]}) else: for query in args_ppi.querys: datas = query.split(":") strain = datas[0] start = datas[1] end = datas[2] strand = datas[3] if (len(row) > 1 ) and (row[0] != "Location") and (name == strain) and ( start == row[0].split("..")[0]) and ( end == row[0].split("..")[1]) and (strand == row[1]): genes.append({"strain": name, "locus_tag": row[5]}) fh.close() return genes def _setup_nospecific(self, paths, strain_id, files): self._make_subfolder(paths["all"], self.without_strain, strain_id["ptt"]) self._make_subfolder(paths["best"], self.without_strain, strain_id["ptt"]) self._make_subfolder(paths["fig"], self.without_strain, strain_id["ptt"]) filename_nostrain = "_".join([ strain_id["file"].replace(".ptt", ""), self.without_strain + ".csv" ]) files["all_nospecific"] = open( 
os.path.join(paths["all"], filename_nostrain), "w") files["best_nospecific"] = open( os.path.join(paths["best"], filename_nostrain), "w") def _setup_folder_and_read_file(self, strain_id, pre_file, files, paths, args_ppi): if strain_id["file"].endswith(".ptt"): if strain_id["file"] != pre_file: self.helper.check_make_folder("_".join( [self.tmp_id, strain_id["file"]])) paths["all"] = os.path.join(self.all_result, strain_id["file"][:-4]) paths["best"] = os.path.join(self.best_result, strain_id["file"][:-4]) paths["fig"] = os.path.join(self.fig, strain_id["file"][:-4]) self.helper.check_make_folder( os.path.join(self.all_result, strain_id["file"][:-4])) self.helper.check_make_folder( os.path.join(self.best_result, strain_id["file"][:-4])) self.helper.check_make_folder( os.path.join(self.fig, strain_id["file"][:-4])) self._make_subfolder(paths["all"], self.with_strain, strain_id["ptt"]) self._make_subfolder(paths["best"], self.with_strain, strain_id["ptt"]) self._make_subfolder(paths["fig"], self.with_strain, strain_id["ptt"]) filename_strain = "_".join([ strain_id["file"].replace(".ptt", ""), self.with_strain + ".csv" ]) files["all_specific"] = open( os.path.join(paths["all"], filename_strain), "w") files["best_specific"] = open( os.path.join(paths["best"], filename_strain), "w") if args_ppi.no_specific: self._setup_nospecific(paths, strain_id, files) files["id_list"] = "_".join([self.tmp_id, strain_id["file"]]) files["id_log"] = open( os.path.join(files["id_list"], self.tmp_files["log"]), "w") files["action_log"] = open( os.path.join(args_ppi.out_folder, self.tmp_files["action"]), "w") files["pubmed_log"] = open( os.path.join(args_ppi.out_folder, self.tmp_files["pubmed"]), "w") pre_file = strain_id["file"] if strain_id["file"] in os.listdir(args_ppi.ptts): genes = self._detect_protein(strain_id, args_ppi) else: self._make_folder_no_exist( os.path.join(paths["all"], self.with_strain), strain_id["ptt"]) self._make_folder_no_exist( os.path.join(paths["best"], 
self.with_strain), strain_id["ptt"]) if args_ppi.no_specific: self._make_folder_no_exist( os.path.join(paths["all"], self.without_strain), strain_id["ptt"]) self._make_folder_no_exist( os.path.join(paths["best"], self.without_strain), strain_id["ptt"]) else: print("Error:wrong .ptt file!!") sys.exit() return genes def _wget_actions(self, files, id_file, strain_id, out_folder): detect = False t_h = open(os.path.join(files["id_list"], id_file), "r") print("Retrieving STRING actions for {0} of {1} -- {2}".format( id_file, strain_id["string"], strain_id["file"])) for row in csv.reader(t_h, delimiter="\t"): if row[0].startswith("stringId"): continue else: detect = True if row[1] == strain_id["string"]: action_source = ("http://string-db.org/api/tsv/actions?" "identifier={0}&species={1}").format( row[0], row[1]) self._run_wget(action_source, self.tmp_files["wget_action"], files["action_log"]) break t_h.close() if not detect: print("Warning: " + id_file + " can not be found in STRING...") return detect def _retrieve_actions(self, files, strain_id, paths, args_ppi): '''get the interaction of proteins''' for id_file in os.listdir(files["id_list"]): if id_file != self.tmp_files["log"]: detect_id = self._wget_actions(files, id_file, strain_id, args_ppi.out_folder) if detect_id: a_h = open(self.tmp_files["wget_action"], "r") pre_row = [] first = True detect = False first_output = { "specific_all": True, "specific_best": True, "nospecific_all": True, "nospecific_best": True } print("Retrieving Pubmed for {0} of {1} -- {2}".format( id_file, strain_id["string"], strain_id["file"])) for row_a in csv.reader(a_h, delimiter="\t"): if row_a == []: print("No interaction can be detected...") break if row_a[0].startswith("item_id_a"): continue else: detect = True if first: first = False mode = row_a[2] actor = row_a[4] else: if (row_a[0] != pre_row[0]) or (row_a[1] != pre_row[1]): self._get_pubmed(pre_row, strain_id, mode, actor, id_file, first_output, strain_id["ptt"], files, paths, 
args_ppi) mode = row_a[2] actor = row_a[4] else: mode = mode + ";" + row_a[2] actor = actor + ";" + row_a[4] pre_row = row_a if detect: detect = False self._get_pubmed(row_a, strain_id, mode, actor, id_file, first_output, strain_id["ptt"], files, paths, args_ppi) if detect_id: a_h.close() def _plot(self, args_ppi, files): if args_ppi.no_specific: files["all_nospecific"].close() files["best_nospecific"].close() files["all_specific"].close() files["best_specific"].close() for folder in os.listdir(self.all_result): if folder in os.listdir(self.fig): print("plotting {0}".format(folder)) plot_ppi( os.path.join(self.all_result, folder, "_".join([folder, self.with_strain + ".csv"])), args_ppi.score, os.path.join(self.fig, folder, self.with_strain), args_ppi.size) if args_ppi.no_specific: plot_ppi( os.path.join( self.all_result, folder, "_".join([folder, self.without_strain + ".csv"])), args_ppi.score, os.path.join(self.fig, folder, self.without_strain), args_ppi.size) def _remove_tmps(self, args_ppi): self.helper.remove_all_content(os.path.join(args_ppi.out_folder), "tmp", "file") self.helper.remove_all_content(os.path.join(args_ppi.out_folder), "tmp", "dir") for file_ in os.listdir(args_ppi.ptts): if file_.startswith("PPI_"): os.remove(os.path.join(args_ppi.ptts, file_)) def retrieve_ppi_network(self, args_ppi): '''retrieve PPI from STRING with PIE and draw network''' strain_ids = [] paths = {} files = {} for strain in args_ppi.strains: datas = strain.split(":") ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt") rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt") self.converter.convert_gff2rntptt( os.path.join(args_ppi.ptts, datas[0]), "0", os.path.join(args_ppi.ptts, ptt_file), os.path.join(args_ppi.ptts, rnt_file), None, None) strain_ids.append({ "file": ptt_file, "ptt": datas[1], "string": datas[2], "pie": datas[3] }) strain_ids.sort(key=lambda x: x["file"]) pre_file = "" for strain_id in strain_ids: genes = self._setup_folder_and_read_file(strain_id, pre_file, 
files, paths, args_ppi) s_h = open(args_ppi.species, "r") for row in csv.reader(s_h, delimiter="\t"): if row[0] != "##": if row[0] == strain_id["string"]: break elif row[2] == strain_id["string"]: strain_id["string"] = row[0] break elif row[3] == strain_id["string"]: strain_id["string"] = row[0] break self._retrieve_id(strain_id, genes, files) self._retrieve_actions(files, strain_id, paths, args_ppi) self._plot(args_ppi, files) self._remove_tmps(args_ppi)
class RATT(object):
    '''Transfer annotations from reference genomes to target genomes.

    The reference GenBank files are split into one file per record and
    converted to EMBL, RATT is run once per "reference:target" pair, and
    (when requested) the RATT results are converted to gff/ptt/rnt and
    merged per target genome.
    '''

    def __init__(self, args_ratt):
        '''Create the helper objects and derive all working paths.'''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # Per-record GenBank files are written to <ref_embls>/gbk_tmp; the
        # record currently being written lives in <ref_embls>/gbk_tmp/tmp.
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt"),
        }

    @staticmethod
    def _first_field(line, keyword):
        '''Return the first whitespace-delimited field of line that is not
        keyword itself, or None when the line has no such field.

        Splitting on any whitespace guarantees that the returned field never
        carries the trailing newline of the line.
        '''
        for field in line.split():
            if field != keyword:
                return field
        return None

    def _convert_to_pttrnt(self, gffs, files):
        '''Write a .ptt and .rnt file next to every .gff listed in files.

        gffs  -- folder that contains the gff files.
        files -- file names (relative to gffs); non-gff names are ignored.
        A gff is only converted when a matching target fasta is found.
        '''
        for name in files:
            if not name.endswith(".gff"):
                continue
            gff = os.path.join(gffs, name)
            prefix = os.path.basename(gff)[:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(
                self.tmp_files["tar"], ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(
                    gff, fasta, ptt, rnt, None, None)

    def _remove_files(self, args_ratt, out_gbk):
        '''Replace the per-sequence outputs by the merged ones and delete
        all temporary folders.'''
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(
                args_ratt.gff_outfolder, suffix, "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        for folder in (self.tmp_files["out_gff"], self.tmp_files["tar"],
                       self.tmp_files["ref"], self.embl):
            shutil.rmtree(folder)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        '''Convert one RATT "final.embl" result to gff and register it.

        The gff name is the RATT file name without its first field and its
        last two fields. The gff is written to output_path, copied to
        gff_outfolder, and its file name is appended to files.
        '''
        fields = ratt_result.split(".")
        strain = ".".join(fields[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        # fix_ratt writes its result to "tmp_gff" in the working directory.
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file,
                    os.path.join(args_ratt.gff_outfolder, filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split GenBank files into one file per record inside self.gbk.

        A record starts at a LOCUS line and ends at a "//" line. The output
        file is named after the VERSION identifier when the record has one,
        otherwise after the LOCUS name. Lines outside a record (before the
        first LOCUS or after a "//") are ignored. A record that is never
        terminated by "//" is not stored.

        Returns the folder that holds the per-record files.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            out = None
            filename = None
            with open(file_, "r") as f_h:
                try:
                    for line in f_h:
                        if line.startswith("LOCUS"):
                            if out is not None:
                                # Previous record had no "//": drop it.
                                out.close()
                            out = open(self.gbk_tmp, "w")
                            record_id = self._first_field(line, "LOCUS")
                            if record_id is None:
                                # Nameless LOCUS line: use the input name.
                                record_id = os.path.splitext(
                                    os.path.basename(file_))[0]
                            filename = ".".join([record_id, "gbk"])
                        elif line.startswith("VERSION"):
                            version = self._first_field(line, "VERSION")
                            if version is not None:
                                filename = ".".join([version, "gbk"])
                        if out is None:
                            continue
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
                finally:
                    if out is not None:
                        out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''Convert every *.gbk file in ref_embls to EMBL format.

        Exits the program when the folder contains no .gbk file. Returns
        the folder holding the per-record GenBank files.
        '''
        gbks = [os.path.join(ref_embls, name)
                for name in os.listdir(ref_embls)
                if name.endswith(".gbk")]
        if not gbks:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        '''Run RATT for one target/reference pair; stdout goes to out.'''
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        '''Run RATT for every "reference:target" pair and tidy its output.

        After each run, entries of the working directory whose name
        contains "final" are moved to output_path; other RATT by-products
        are deleted. The log is opened once so that it keeps the output of
        all pairs instead of only the last one.
        '''
        print("Running RATT...")
        leftovers = (args_ratt.element, "query", "Reference", "Query",
                     "Sequences")
        with open(self.ratt_log, "w") as out:
            for pair in args_ratt.pairs:
                fields = pair.split(":")
                ref = fields[0]
                tar = fields[1]
                print(tar)
                self._run_ratt(args_ratt, tar, ref, out)
                for filename in os.listdir():
                    if "final" in filename:
                        shutil.move(filename, os.path.join(
                            args_ratt.output_path, filename))
                    elif any(mark in filename for mark in leftovers):
                        if os.path.isfile(filename):
                            os.remove(filename)
                        elif os.path.isdir(filename):
                            shutil.rmtree(filename)

    def _merge_genome_outputs(self, args_ratt, folder):
        '''Merge the gff/ptt/rnt files of all sequences of one target
        genome (one "<fasta>_folder") into single per-genome files.'''
        # Everything before the last dot, so any fasta extension works.
        prefix = folder.split("_folder")[0].rpartition(".")[0]
        stems = [name[:-3] for name in os.listdir(
            os.path.join(args_ratt.tar_fastas, folder))]
        for result in os.listdir(args_ratt.gff_outfolder):
            for stem in stems:
                if stem != result[:-4]:
                    continue
                for kind in ("gff", "ptt", "rnt"):
                    if ("." + kind) in result:
                        self.helper.merge_file(
                            os.path.join(args_ratt.gff_outfolder, result),
                            self.tmp_files[kind])
        for kind in ("gff", "ptt", "rnt"):
            shutil.move(self.tmp_files[kind], os.path.join(
                self.tmp_files["out_gff"], prefix + "." + kind))

    def annotation_transfer(self, args_ratt):
        '''Run the complete annotation transfer described by args_ratt.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    done = len(files)
                    self._convert_to_gff(data, args_ratt, files)
                    # Only the gff added just now needs ptt/rnt files; the
                    # earlier ones were converted in previous iterations.
                    self._convert_to_pttrnt(args_ratt.gff_outfolder,
                                            files[done:])
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                if "_folder" in folder:
                    self._merge_genome_outputs(args_ratt, folder)
        self._remove_files(args_ratt, out_gbk)
class RATT(object):
    '''annotation transfer

    The reference annotation (GenBank files, converted to EMBL on the fly,
    or ready-made EMBL files) is transferred to the target genomes by
    running RATT once per "reference:target" pair. The RATT results are
    then converted to gff/ptt/rnt and merged per target genome. Progress
    is reported through the writable log object passed to the methods.
    '''

    def __init__(self, args_ratt):
        '''Create the helper objects and derive all working paths.'''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            # Per-record GenBank files go to <ref_gbk>/gbk_tmp; the record
            # currently being written lives in <ref_gbk>/gbk_tmp/tmp.
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            # Ready-made EMBL files take precedence over converted ones.
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt"),
        }

    @staticmethod
    def _first_field(line, keyword):
        '''Return the first whitespace-delimited field of line that is not
        keyword itself, or None when the line has no such field.'''
        for field in line.split():
            if field != keyword:
                return field
        return None

    def _convert_to_pttrnt(self, gffs, files, log):
        '''Write a .ptt and .rnt file next to every .gff listed in files.

        gffs  -- folder that contains the gff files.
        files -- file names (relative to gffs); non-gff names are ignored.
        log   -- writable log; one line is written per generated file.
        A gff is only converted when a matching target fasta is found.
        '''
        for name in files:
            if not name.endswith(".gff"):
                continue
            gff = os.path.join(gffs, name)
            prefix = os.path.basename(gff)[:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(
                self.tmp_files["tar"], ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(
                    gff, fasta, ptt, rnt, None, None)
                log.write("\t" + ptt + " is generated.\n")
                log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''Replace the per-sequence outputs by the merged ones and delete
        the temporary folders. out_gbk is accepted for interface
        compatibility and is not used here.'''
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(
                args_ratt.gff_outfolder, suffix, "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        for folder in (self.tmp_files["out_gff"], self.tmp_files["tar"],
                       self.tmp_files["ref"]):
            shutil.rmtree(folder)
        for folder in (args_ratt.tar_fastas, args_ratt.ref_fastas,
                       args_ratt.ref_embls, args_ratt.ref_gbk):
            self.helper.remove_tmp_dir(folder)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''Convert one RATT "final.embl" result to gff and register it.

        The gff name is the RATT file name without its first field and its
        last two fields. The gff is written to output_path, copied to
        gff_outfolder, and its file name is appended to files.
        '''
        fields = ratt_result.split(".")
        strain = ".".join(fields[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        # fix_ratt writes its result to "tmp_gff" in the working directory.
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        final_gff = os.path.join(args_ratt.gff_outfolder, filename)
        shutil.copy(output_file, final_gff)
        log.write("\t" + final_gff + " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split GenBank files into one file per record inside self.gbk.

        A record starts at a LOCUS line and ends at a "//" line. The output
        file is named after the VERSION identifier when the record has one,
        otherwise after the LOCUS name. Lines outside a record (before the
        first LOCUS or after a "//") are ignored. A record that is never
        terminated by "//" is not stored.

        Returns the folder that holds the per-record files.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            out = None
            filename = None
            with open(file_, "r") as f_h:
                try:
                    for line in f_h:
                        if line.startswith("LOCUS"):
                            if out is not None:
                                # Previous record had no "//": drop it.
                                out.close()
                            out = open(self.gbk_tmp, "w")
                            record_id = self._first_field(line, "LOCUS")
                            if record_id is None:
                                # Nameless LOCUS line: use the input name.
                                record_id = os.path.splitext(
                                    os.path.basename(file_))[0]
                            filename = ".".join([record_id, "gbk"])
                        elif line.startswith("VERSION"):
                            version = self._first_field(line, "VERSION")
                            if version is not None:
                                filename = ".".join([version, "gbk"])
                        if out is None:
                            continue
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
                finally:
                    if out is not None:
                        out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl

        ref_embls is the folder searched for GenBank files (names ending
        in .gbk, .gbff or .gb). Exits the program when none is found.
        Returns the folder holding the per-record GenBank files.
        '''
        gbks = [os.path.join(ref_embls, name)
                for name in os.listdir(ref_embls)
                if name.endswith((".gbk", ".gbff", ".gb"))]
        if not gbks:
            log.write(
                "--related_gbk_files is assigned, but not gbk files are detected.\n"
                "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        log.write(
            "Running converter.py to convert gbk file to embl format.\n")
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        log.write("\t" + self.embl +
                  " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''Run RATT for one target/reference pair.

        Exits the program when the EMBL folder or one of the two fasta
        files is missing. RATT's stdout is redirected to out; the command
        line is recorded in log.
        '''
        tar_fasta = os.path.join(self.tmp_files["tar"], tar + ".fa")
        ref_fasta = os.path.join(self.tmp_files["ref"], ref + ".fa")
        if not all(os.path.exists(path)
                   for path in (self.embl, tar_fasta, ref_fasta)):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write(
                "The strain names in --compare_pair should be the same "
                "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        # Build the command once so the logged and executed commands agree.
        command = [args_ratt.ratt_path, self.embl, tar_fasta,
                   args_ratt.element, args_ratt.transfer_type, ref_fasta]
        log.write(" ".join(command) + "\n")
        call(command, stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''Run RATT for every "reference:target" pair and tidy its output.

        After each run, entries of the working directory whose name
        contains "final" are moved to output_path; other RATT by-products
        are deleted. ratt_log is opened once so that it keeps the output
        of all pairs instead of only the last one.
        '''
        print("Running RATT")
        leftovers = (args_ratt.element, "query", "Reference", "Query",
                     "Sequences")
        with open(self.ratt_log, "w") as out:
            for pair in args_ratt.pairs:
                fields = pair.split(":")
                ref = fields[0]
                tar = fields[1]
                self._run_ratt(args_ratt, tar, ref, out, log)
                log.write("The following files are generatd:\n")
                for filename in os.listdir():
                    if "final" in filename:
                        log.write("\t" + filename + "\n")
                        shutil.move(filename, os.path.join(
                            args_ratt.output_path, filename))
                    elif any(mark in filename for mark in leftovers):
                        log.write("\t" + filename + "\n")
                        if os.path.isfile(filename):
                            os.remove(filename)
                        elif os.path.isdir(filename):
                            shutil.rmtree(filename)

    def _merge_genome_outputs(self, args_ratt, folder, log):
        '''Merge the gff/ptt/rnt files of all sequences of one target
        genome (one "<fasta>_folder") into single per-genome files.'''
        # Everything before the last dot, so any fasta extension works.
        prefix = folder.split("_folder")[0].rpartition(".")[0]
        stems = [name[:-3] for name in os.listdir(
            os.path.join(args_ratt.tar_fastas, folder))]
        for result in os.listdir(args_ratt.gff_outfolder):
            for stem in stems:
                if stem != result[:-4]:
                    continue
                for kind in ("gff", "ptt", "rnt"):
                    if ("." + kind) in result:
                        self.helper.merge_file(
                            os.path.join(args_ratt.gff_outfolder, result),
                            self.tmp_files[kind])
        if os.path.exists(self.tmp_files["gff"]):
            for kind in ("gff", "ptt", "rnt"):
                shutil.move(self.tmp_files[kind], os.path.join(
                    self.tmp_files["out_gff"], prefix + "." + kind))
        else:
            print("Error: Please check your fasta or "
                  "annotation files, they should only contain "
                  "the query genome. And make sure your RATT can "
                  "work properly (check $ANNOgesic/output/"
                  "annotation_transfer/ratt_log.txt).")
            log.write("Please check your fasta or "
                      "annotation files, they should only contain "
                      "the query genome. And make sure your RATT can "
                      "work properly (check $ANNOgesic/output/"
                      "annotation_transfer/ratt_log.txt).\n")

    def annotation_transfer(self, args_ratt, log):
        '''Run the complete annotation transfer described by args_ratt.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # No EMBL files supplied: derive them from the GenBank folder.
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write(
                    "Running converter.py to convert embl "
                    "files in {0} to gff, ptt, and rnt format.\n".format(
                        data))
                done = len(files)
                self._convert_to_gff(data, args_ratt, files, log)
                # Only the gff added just now needs ptt/rnt files; the
                # earlier ones were converted in previous iterations.
                self._convert_to_pttrnt(args_ratt.gff_outfolder,
                                        files[done:], log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(
            args_ratt.output_path))
        for folder in os.listdir(args_ratt.tar_fastas):
            if "_folder" in folder:
                self._merge_genome_outputs(args_ratt, folder, log)
        self._remove_files(args_ratt, out_gbk, log)
class Terminator(object): '''detection of terminator''' def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables")} self.terms = {"all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "expressed_candidates"), "best": os.path.join(self.outfolder["term"], "best_candidates"), "non": os.path.join(self.outfolder["term"], "non_expressed_candidates")} self.csvs = {"all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "expressed_candidates"), "best": os.path.join(self.outfolder["csv"], "best_candidates"), "non": os.path.join(self.outfolder["csv"], "non_expressed_candidates")} self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp")} self.suffixs = {"gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff"} if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: if (file_.endswith(".ptt")) and (os.stat(file_).st_size == 0): print("Warning: No CDS information, " "TransTermHP can not work!") return "NO_CDS" if os.path.exists(file_) and ( os.stat(file_).st_size != 0): check_start = False fh = 
open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() return "Normal" def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file( fasta_path, ".fa", prefix, None, None) if not fasta: log.write("{0}.fa can not be found.\n".format(prefix)) print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): log.write("Running converter.py to convert {0} and " "{1} to {2}, {3}, and {4}.\n".format( gff_file, srna, ptt_file, rnt_file, srna.replace(".gff", ".rnt"))) self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n\t{2}\n".format( ptt_file, rnt_file, srna.replace(".gff", ".rnt"))) if (not srna) and (fasta): log.write("Running converter.py to convert {0} " "to {1}, and {2}.\n".format( gff_file, ptt_file, rnt_file)) self.converter.convert_gff2rntptt( gff_file, fasta, 
ptt_file, rnt_file, None, None) file_types[prefix] = "normal" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file)) else: log.write("Running converter.py to convert {0} " "to {1}, and {2}.\n".format( gff_file, ptt_file, rnt_file)) self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file)) return file_types, prefixs def _combine_ptt_rnt(self, gff_path, file_types, srna_path): self.helper.check_make_folder(self.combine_path) for prefix, file_type in file_types.items(): combine_file = os.path.join(self.combine_path, prefix + '.ptt') if file_type == "normal": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt")] check = self._combine_annotation(combine_file, files) elif file_type == "srna": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt"), os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))] check = self._combine_annotation(combine_file, files) return check def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term, log): call([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))], stdout=out) log.write(" ".join([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))]) + "\n") def _run_TransTermHP(self, args_term, log): 
self.helper.check_make_folder(self.tmps["transterm"]) log.write("Running TransTermHP.\n") log.write("Make sure the version is at least 2.09.\n") for file_ in os.listdir(self.combine_path): if ".ptt" in file_: prefix = file_.replace(".ptt", "") fasta = self.helper.get_correct_file( self.fasta_path, ".fa", prefix, None, None) if not fasta: log.write("{0}.fa can not be found!.\n".format(prefix)) print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() out_path = os.path.join(args_term.hp_folder, prefix) self.helper.check_make_folder(out_path) out = open(os.path.join(out_path, "_".join([prefix, "terminators.txt"])), "w") self._TransTermHP(fasta, file_, out_path, prefix, out, args_term, log) log.write("Done!\n") log.write("The following files are generated in {0}.\n".format( out_path)) for file_ in os.listdir(out_path): log.write("\t" + file_ + "\n") out.close() shutil.rmtree(self.combine_path) def _convert_to_gff(self, prefixs, args_term, log): log.write("Running coverter.py to convert the results of TransTermHP " "to gff3 format.\n") for prefix in prefixs: for folder in os.listdir(args_term.hp_folder): if prefix == folder: out_path = os.path.join(args_term.hp_folder, folder) for file_ in os.listdir(out_path): if file_.endswith(".bag"): out_file = os.path.join( self.tmps["transterm"], "_".join([prefix, self.tmps["hp_gff"]])) self.converter.convert_transtermhp2gff( os.path.join(out_path, file_), out_file) log.write("\t" + out_file + " is generated.\n") self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"], None, self.tmps["hp"]) def _combine_wigs(self, args_term): if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): folder = args_term.tex_wigs.split("/") folder = "/".join(folder[:-1]) merge_wigs = os.path.join(folder, "merge_wigs") self.helper.check_make_folder(merge_wigs) for wig in os.listdir(args_term.tex_wigs): if os.path.isdir(os.path.join(args_term.tex_wigs, wig)): pass else: 
shutil.copy(os.path.join(args_term.tex_wigs, wig), merge_wigs) for wig in os.listdir(args_term.frag_wigs): if os.path.isdir(os.path.join(args_term.frag_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.frag_wigs, wig), merge_wigs) elif (args_term.tex_wigs is not None): merge_wigs = args_term.tex_wigs elif (args_term.frag_wigs is not None): merge_wigs = args_term.frag_wigs else: print("Error: Wiggle files are not assigned!") sys.exit() return merge_wigs def _merge_sRNA(self, sRNAs, prefixs, gff_path): '''searching the terminator with sRNA information''' if sRNAs is not None: self.multiparser.parser_gff(sRNAs, "sRNA") self.helper.check_make_folder(self.tmps["merge"]) for prefix in prefixs: tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"]) if self.tmps["gff"] in os.listdir(self.tmps["merge"]): os.remove(tmp_gff) self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"), tmp_gff) self.helper.merge_file(os.path.join( self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff) self.helper.sort_gff(tmp_gff, os.path.join( self.tmps["merge"], prefix + ".gff")) os.remove(tmp_gff) merge_path = self.tmps["merge"] else: merge_path = gff_path return merge_path def _move_file(self, term_outfolder, csv_outfolder): for gff in os.listdir(term_outfolder): if gff.endswith("_term.gff"): self.helper.sort_gff(os.path.join(term_outfolder, gff), self.tmps["gff"]) shutil.move(self.tmps["gff"], os.path.join(term_outfolder, gff)) prefix = gff.replace("_term.gff", "") new_gff = os.path.join(self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]])) csv_file = os.path.join( os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) out = open(new_gff, "w") out.write("##gff-version 3\n") out.close() self.helper.merge_file( os.path.join(term_outfolder, gff), os.path.join( self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]]))) os.remove(os.path.join(term_outfolder, gff)) pre_strain = "" if ("_".join([prefix, self.suffixs["csv"]]) in 
os.listdir(self.csvs["all"])): os.remove(csv_file) out_csv = open(csv_file, "w") out_csv.write("\t".join(["Genome", "Name", "Start", "End", "Strand", "Detect", "Coverage_decrease", "Coverage_detail"]) + "\n") out_csv.close() fh = open(new_gff) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: self.helper.merge_file(os.path.join( self.tmps["term_table"], "_".join([ entry.seq_id, "term_raw.csv"])), os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) pre_strain = entry.seq_id fh.close() def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log): log.write("Computing secondray structures of {0}.\n".format(prefix)) log.write("Make sure the version of Vienna RNA package is at least 2.3.2.\n") print("Computing secondray structures of {0}".format(prefix)) self.helper.check_make_folder(self.tmps["folder"]) pre_cwd = os.getcwd() os.chdir(self.tmps["folder"]) log.write(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)]) + "\n") os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)])) log.write("Done!\n") log.write("\t" + tmp_sec + " is generated for storing secondary " "structure.\n") os.chdir(pre_cwd) shutil.rmtree(self.tmps["folder"]) def _compute_intersection_forward_reverse( self, prefixs, merge_path, wig_path, merge_wigs, args_term, log): '''the approach for searching gene converged region terminator''' log.write("Searching terminators which located in gene converged " "region.\n") for prefix in prefixs: tmp_seq = os.path.join(args_term.out_folder, "_".join(["inter_seq", prefix])) tmp_index = os.path.join(args_term.out_folder, "_".join(["inter_index", prefix])) tmp_sec = os.path.join(args_term.out_folder, "_".join(["inter_sec", prefix])) tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) gff_file = os.path.join(merge_path, prefix + ".gff") tmp_cand = tmp_cand = os.path.join(args_term.out_folder, 
"_".join(["term_candidates", prefix])) if os.path.exists(tran_file): print("Extracting sequences of {0}".format(prefix)) log.write("Running get_inter_seq.py to extract the potential " "sequences from {0}.\n".format(prefix)) intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"), tran_file, gff_file, tmp_seq, tmp_index, args_term) log.write("\t" + tmp_seq + " is generated for storing the " "potential sequences.\n") self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix, log) log.write("Running extract_sec_info.py to extract the " "information of secondary structure from {0}.\n".format( prefix)) extract_info_sec(tmp_sec, tmp_seq, tmp_index) os.remove(tmp_index) log.write("Running get_polyT.py to detect the " "terminator candidates for {0}.\n".format(prefix)) poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term) log.write("\t" + tmp_cand + " which temporary stores terminator " "candidates is generated.\n") print("Detecting terminators for " + prefix) log.write("Running detect_coverage_term.py to gain " "high-confidence terminators for {0}.\n".format(prefix)) detect_coverage( tmp_cand, os.path.join(merge_path, prefix + ".gff"), os.path.join(self.tran_path, "_".join([ prefix, "transcript.gff"])), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(wig_path, "_".join([prefix, "forward.wig"])), os.path.join(wig_path, "_".join([prefix, "reverse.wig"])), os.path.join(self.tmps["hp_path"], "_".join([ prefix, self.tmps["hp_gff"]])), merge_wigs, os.path.join(self.outfolder["term"], "_".join([ prefix, self.suffixs["gff"]])), os.path.join(self.tmps["term_table"], "_".join([ prefix, "term_raw.csv"])), args_term) self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"], None, "term") self._move_file(self.outfolder["term"], self.outfolder["csv"]) def _remove_tmp_file(self, merge_wigs, args_term): self.helper.remove_tmp_dir(args_term.gffs) self.helper.remove_tmp_dir(args_term.fastas) if args_term.srnas is not None: 
self.helper.remove_tmp(args_term.srnas) shutil.rmtree(self.tmps["merge"]) if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): shutil.rmtree(merge_wigs) self.helper.remove_tmp_dir(args_term.trans) if "tmp_wig" in os.listdir(args_term.out_folder): shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig")) self.helper.remove_tmp(self.outfolder["term"]) shutil.rmtree(self.tmps["transterm"]) shutil.rmtree(self.tmps["term_table"]) self.helper.remove_all_content(args_term.out_folder, "inter_seq_", "file") self.helper.remove_all_content(self.outfolder["term"], "_term.gff", "file") self.helper.remove_all_content(args_term.out_folder, "inter_sec_", "file") self.helper.remove_all_content(args_term.out_folder, "term_candidates_", "file") def _compute_stat(self, args_term, log): new_prefixs = [] for gff in os.listdir(self.terms["all"]): if gff.endswith("_term_all.gff"): out_tmp = open(self.tmps["gff"], "w") out_tmp.write("##gff-version 3\n") new_prefix = gff.replace("_term_all.gff", "") new_prefixs.append(gff.replace("_term_all.gff", "")) num = 0 fh = open(os.path.join(self.terms["all"], gff)) for entry in self.gff_parser.entries(fh): name = '%0*d' % (5, num) entry.attributes["ID"] = ( entry.seq_id + "_terminator" + str(num)) entry.attributes["Name"] = "_".join(["terminator_" + name]) entry.attribute_string = ";".join([ "=".join(items) for items in entry.attributes.items()]) out_tmp.write("\t".join([entry.info_without_attributes, entry.attribute_string]) + "\n") num += 1 out_tmp.close() fh.close() shutil.move(self.tmps["gff"], os.path.join(self.terms["all"], "_".join([new_prefix, self.suffixs["gff"]]))) log.write("Running stat_term.py to do statistics.\n") stat_path = os.path.join(args_term.out_folder, "statistics") log.write("The following files are generated:\n") for prefix in new_prefixs: stat_term(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])), 
os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])), os.path.join(self.terms["best"], "_".join([prefix, "term"])), os.path.join(self.terms["express"], "_".join([prefix, "term"])), os.path.join(self.terms["non"], "_".join([prefix, "term"]))) shutil.move(os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]]))) os.remove(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) log.write("\t" + os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])) + "\n") log.write("\t" + os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])) + "\n") log.write("\t" + os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]])) + "\n") log.write("\t" + os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]])) + "\n") log.write("\t" + os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]])) + "\n") def _check_gff_file(self, folder): for file_ in os.listdir(folder): if file_.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, file_)) def _compare_term_tran(self, args_term, prefixs, log): '''searching the associated terminator to transcript''' 
self.multiparser.combine_gff(args_term.gffs, self.tran_path, None, "transcript") prefixs = [] print("Comparing terminators with transcripts now") for file_ in os.listdir(self.tran_path): if file_.endswith("_transcript.gff"): prefixs.append(file_.replace("_transcript.gff", "")) log.write("Running compare_tran_term.py for comparing transcripts " "and terminators.\n") log.write("The following files are generated:\n") for type_ in ("best_candidates", "expressed_candidates", "all_candidates"): compare_term_tran(self.tran_path, os.path.join(self.outfolder["term"], type_), args_term.fuzzy_up_ta, args_term.fuzzy_down_ta, args_term.out_folder, "terminator", self.outfolder["term"], args_term.trans) for prefix in prefixs: shutil.move( os.path.join( args_term.out_folder, "statistics", "stat_compare_transcript_terminator_" + prefix + ".csv"), os.path.join( args_term.out_folder, "statistics", "_".join(["stat_compare_terminator_transcript", prefix, type_ + ".csv"]))) log.write("\t" + os.path.join( args_term.out_folder, "statistics", "_".join(["stat_compare_terminator_transcript", prefix, type_ + ".csv"])) + "\n") def _re_table(self, args_term, prefixs, log): log.write("Running re_table.py to generate coverage information.\n") log.write("The following files are updated:\n") for type_ in ["all_candidates", "best_candidates", "expressed_candidates", "non_expressed_candidates"]: for table in os.listdir(os.path.join( args_term.out_folder, "tables", type_)): term_table = os.path.join(args_term.out_folder, "tables", type_, table) reorganize_table(args_term.libs, args_term.merge_wigs, "Coverage_detail", term_table) log.write("\t" + term_table + "\n") def run_terminator(self, args_term, log): self._check_gff_file(args_term.gffs) self._check_gff_file(args_term.trans) self.multiparser.parser_fasta(args_term.fastas) if (not args_term.gffs) or (not args_term.fastas): print("Error: Please assign gff files " "and fasta files!") sys.exit() file_types, prefixs = self._convert_gff2rntptt( 
self.gff_path, self.fasta_path, args_term.srnas, log) check = self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path) self._run_TransTermHP(args_term, log) self._convert_to_gff(prefixs, args_term, log) self.helper.remove_tmp(self.gff_path) self.multiparser.parser_gff(args_term.trans, "transcript") self.helper.check_make_folder(self.tmps["term_table"]) if check != "NO_CDS": self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"]) merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path) self._compute_intersection_forward_reverse( prefixs, merge_path, args_term.wig_path, args_term.merge_wigs, args_term, log) self._compute_stat(args_term, log) self._compare_term_tran(args_term, prefixs, log) self._re_table(args_term, prefixs, log) self._remove_tmp_file(args_term.merge_wigs, args_term)
class TestConverter(unittest.TestCase):
    '''Tests of Converter, run with mocked parsers and file readers.'''

    def setUp(self):
        '''Create the converter with its mocks and the test folder.'''
        self.converter = Converter()
        self.example = Example()
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)

    def tearDown(self):
        '''Remove the test folder and everything written into it.'''
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    @staticmethod
    def get_info(datas):
        '''Return the first eight columns of every non-comment line.'''
        return ["\t".join(data.split("\t")[:8])
                for data in datas if not data.startswith("#")]

    def test_print_rntptt_file(self):
        cdss = []
        genes = []
        rnas = []
        gff_dict = Example().gff_dict
        for gff in gff_dict:
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index],
                                 self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(
                tss, tss_index, tss_features, nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic",
                                               "TSS", "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(
            gff_file, fasta_file, ptt_file, rnt_file,
            srna_input_file, srna_output_file)
        # Asserting the path strings themselves is always true; check that
        # the output files have really been written.
        self.assertTrue(os.path.exists(srna_output_file))
        self.assertTrue(os.path.exists(rnt_file))
        self.assertTrue(os.path.exists(ptt_file))

    def test_convert_embl2gff(self):
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    def test_convert_circ2gff(self):
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        # Only the first eight columns of the data lines are compared.
        self.assertListEqual(self.get_info(import_data(out_all)),
                             self.get_info(self.circ_all.split("\n")))
        self.assertListEqual(self.get_info(import_data(out_filter)),
                             self.get_info(self.circ_best.split("\n")))
class RATT(object):
    '''annotation transfer'''

    def __init__(self, args_ratt):
        '''Create the helper objects and derive the working paths.'''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        # An explicitly assigned embl folder wins over the derived one.
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        '''Convert every gff of ``files`` (located in ``gffs``) to ptt/rnt.

        A file is only converted when a matching fasta file is found in
        the temporary target fasta folder.
        '''
        for gff in files:
            if not gff.endswith(".gff"):
                continue
            gff = os.path.join(gffs, gff)
            prefix = gff.split("/")[-1][:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)
                log.write("\t" + ptt + " is generated.\n")
                log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''Replace the per-sequence outputs by the merged ones and clean up.

        ``out_gbk`` is accepted for interface compatibility but is not
        used.
        '''
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(args_ratt.gff_outfolder,
                                           suffix, "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''Convert one embl result of RATT to gff.

        The gff file is written to the output folder, fixed by the format
        fixer, copied to the gff output folder and its name is appended to
        ``files``.
        '''
        name = ratt_result.split(".")
        # The first and the last two dot-separated fields are dropped.
        seq_name = ".".join(name[1:-2])
        filename = seq_name + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, seq_name, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        final_gff = os.path.join(args_ratt.gff_outfolder, filename)
        shutil.copy(output_file, final_gff)
        log.write("\t" + final_gff + " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split the genbank files into one file per record.

        Every record (from its ``LOCUS`` line to the ``//`` line) is
        written to the gbk folder and named after its ``LOCUS`` or
        ``VERSION`` entry. Lines outside of a record (e.g. a blank line
        after the final ``//``) are ignored. Returns the gbk folder.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            # Handle of the record being written; None outside a record.
            out = None
            try:
                with open(file_, "r") as f_h:
                    for line in f_h:
                        if line.startswith("LOCUS"):
                            if out is not None:
                                out.close()
                            out = open(self.gbk_tmp, "w")
                            for data in line.split(" "):
                                if (len(data) != 0) and (data != "LOCUS"):
                                    filename = ".".join([data.strip(),
                                                         "gbk"])
                                    break
                        elif line.startswith("VERSION"):
                            for data in line.split(" "):
                                if (len(data) != 0) and (data != "VERSION"):
                                    new_filename = ".".join([data.strip(),
                                                             "gbk"])
                                    break
                            # NOTE(review): str.find returns 0 (false) only
                            # when new_filename starts with filename, so the
                            # VERSION name is taken in all other cases --
                            # confirm that this is intended.
                            if new_filename.find(filename):
                                filename = new_filename
                        if out is not None:
                            out.write(line)
                            if line.startswith("//"):
                                out.close()
                                # Forget the closed handle so that trailing
                                # lines are not written to it.
                                out = None
                                shutil.move(self.gbk_tmp,
                                            os.path.join(self.gbk, filename))
            finally:
                # A record without a closing "//" is only closed.
                if out is not None:
                    out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl

        Collects the genbank files (.gbk, .gbff, .gb) of ``ref_embls``,
        splits them per record, converts them to embl and moves the embl
        files to the embl folder. Exits when no genbank file is found.
        Returns the folder of the split genbank files.
        '''
        gbks = [os.path.join(ref_embls, embl)
                for embl in os.listdir(ref_embls)
                if embl.endswith((".gbk", ".gbff", ".gb"))]
        if not gbks:
            log.write("--related_gbk_files is assigned, but not gbk files "
                      "are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, "
                      "or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        log.write("Running converter.py to convert gbk file to embl "
                  "format.\n")
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        log.write("\t" + self.embl + " is generated and the embl files are "
                  "stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''Run RATT for one target/reference pair.

        Exits when the embl folder or one of the fasta files is missing.
        The standard output of RATT is written to ``out``.
        '''
        tar_fasta = os.path.join(self.tmp_files["tar"], tar + ".fa")
        ref_fasta = os.path.join(self.tmp_files["ref"], ref + ".fa")
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(tar_fasta)) or (
                not os.path.exists(ref_fasta)):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl "
                      "files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        command = [args_ratt.ratt_path, self.embl, tar_fasta,
                   args_ratt.element, args_ratt.transfer_type, ref_fasta]
        log.write(" ".join(command) + "\n")
        call(command, stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''Run RATT for every pair and collect its output files.

        Files of the working directory with "final" in their name are
        moved to the output folder; the other files and folders produced
        by the run are removed.
        '''
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # The handle is closed after every pair instead of once at the
            # end. The log file is rewritten for every pair.
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if "final" in filename:
                    log.write("\t" + filename + "\n")
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path,
                                             filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        '''Run the whole annotation transfer.

        Converts the genbank files to embl when no embl folder is
        assigned, runs RATT, converts its results to gff/ptt/rnt, merges
        the files of the sequences belonging to the same target fasta and
        removes the temporary files.
        '''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # The rest of this class reads the genbank folder from
            # "ref_gbk"; fall back to it when "ref_gbki" is not defined.
            gbk_folder = getattr(args_ratt, "ref_gbki", args_ratt.ref_gbk)
            out_gbk = self._convert_embl(gbk_folder, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt "
                          "format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            if "_folder" not in folder:
                continue
            datas = folder.split("_folder")
            prefix = ".".join(datas[0].split(".")[:-1])
            # Names of the sequences of this folder, without ".fa".
            members = [file_[:-3] for file_ in os.listdir(
                os.path.join(args_ratt.tar_fastas, folder))]
            for gff in os.listdir(args_ratt.gff_outfolder):
                for file_ in members:
                    if file_ != gff[:-4]:
                        continue
                    for suffix in ("gff", "ptt", "rnt"):
                        if ("." + suffix) in gff:
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files[suffix])
            if os.path.exists(self.tmp_files["gff"]):
                for suffix in ("gff", "ptt", "rnt"):
                    shutil.move(self.tmp_files[suffix],
                                os.path.join(self.tmp_files["out_gff"],
                                             prefix + "." + suffix))
            else:
                print("Error: Please check your fasta or "
                      "annotation files, they should only contain "
                      "the query genome. And make sure your RATT can "
                      "work properly (check $ANNOgesic/output/"
                      "annotation_transfer/ratt_log.txt).")
                log.write("Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)