class Terminator(object):
    '''detection of terminator

    Drives the terminator workflow: converts the annotation gff files to
    ptt/rnt tables, runs TransTermHP on them, extracts and folds sequences
    (``intergenic_seq``, RNAfold, ``poly_t``), calls ``detect_coverage``
    for every genome and finally produces the statistics and the
    comparison with transcripts.  ``run_terminator`` is the entry point;
    every other method is a private step of that workflow.
    '''

    def __init__(self, args_term):
        '''Set up helper objects and all input/output/temporary paths.

        args_term -- argument container; ``gffs``, ``fastas``, ``trans``,
                     ``out_folder`` and ``srnas`` are read here.
        The output folders are created right away via
        ``_make_gff_folder``.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # gff output folders, one per candidate class
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        # table output folders, mirroring self.terms
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # names and locations of temporary files/folders
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        '''Concatenate the table bodies of ``files`` into ``combine_file``.

        For every input file only the lines after the first line that
        contains "Location" are copied.  If the last line of an input
        file has no trailing newline, one is appended so the next file
        starts on a fresh line.  Empty input files contribute nothing.
        '''
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                # Tracked per file: testing the loop variable after the
                # loop would be unbound for an empty first file and stale
                # (left over from the previous file) for a later one.
                last_line = None
                with open(file_, 'r') as fh:
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                        last_line = line
                if (last_line is not None) and ("\n" not in last_line):
                    result.write("\n")

    def _make_gff_folder(self):
        '''Create the gff and table output folder of every candidate class.'''
        for key in ("all", "best", "express", "non"):
            self.helper.check_make_folder(self.terms[key])
            self.helper.check_make_folder(self.csvs[key])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        '''Convert every ``.gff`` file in ``gff_path`` to ptt/rnt files.

        Returns ``(file_types, prefixs)``: ``file_types`` maps each file
        prefix to "srna" (a matching sRNA gff was found and converted as
        well) or "normal"; ``prefixs`` lists the prefixes in listing
        order.  Exits the program when the matching fasta file is missing.
        '''
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                    fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                srna = None
                if sRNAs:
                    # NOTE(review): this re-parses the sRNA folder for every
                    # gff file; looks loop-invariant - confirm before hoisting.
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                # fasta is known to be set here, so only srna decides.
                if srna:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, srna,
                        srna.replace(".gff", ".rnt"))
                    file_types[prefix] = "srna"
                else:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        '''Merge the ptt/rnt (and sRNA rnt) tables of each prefix.

        The merged table is written to ``<combine_path>/<prefix>.ptt``.
        Prefixes whose type is neither "normal" nor "srna" are skipped.
        '''
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            if file_type not in ("normal", "srna"):
                continue
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            files = [os.path.join(gff_path, prefix + ".ptt"),
                     os.path.join(gff_path, prefix + ".rnt")]
            if file_type == "srna":
                files.append(os.path.join(
                    srna_path, "_".join([prefix, "sRNA.rnt"])))
            self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        '''Run TransTermHP for one genome; its stdout is written to ``out``.

        The ``--t2t-perf`` and ``--bag-output`` files are placed in
        ``out_path`` and named after ``prefix``.
        '''
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        '''Run TransTermHP on every combined ptt file.

        Results go to ``<hp_folder>/<prefix>``; the combine folder is
        removed afterwards.  Exits when a matching fasta file is missing.
        '''
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                    self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                # "with" guarantees the report is closed even if the
                # external call fails.
                with open(os.path.join(
                        out_path,
                        "_".join([prefix, "terminators.txt"])), "w") as out:
                    self._TransTermHP(fasta, file_, out_path,
                                      prefix, out, args_term)
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        '''Convert the TransTermHP ``.bag`` outputs to gff files.

        Only sub-folders of ``hp_folder`` whose name equals one of
        ``prefixs`` are considered; the converted files are then combined
        via ``multiparser.combine_gff``.
        '''
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        '''Return the folder holding the wiggle files to use.

        If both ``tex_wigs`` and ``frag_wigs`` are given, their plain
        files (sub-folders are ignored) are copied into a "merge_wigs"
        folder next to ``tex_wigs`` and that folder is returned.  If only
        one is given, it is returned unchanged.  Exits when neither is
        assigned.
        '''
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig_folder in (args_term.tex_wigs, args_term.frag_wigs):
                for wig in os.listdir(wig_folder):
                    if not os.path.isdir(os.path.join(wig_folder, wig)):
                        shutil.copy(os.path.join(wig_folder, wig), merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information

        When ``sRNAs`` is given, the annotation gff and the sRNA gff of
        every prefix are merged and sorted into the temporary merge
        folder, which is returned.  Otherwise ``gff_path`` is returned
        unchanged.
        '''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                # start from a clean scratch file for every prefix
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        '''Sort each ``*_term.gff`` and build its "all candidates" outputs.

        For every ``<prefix>_term.gff`` in ``term_outfolder`` the file is
        sorted, copied (below a gff-version header) to the "all" gff
        folder as ``<prefix>_term_all.gff`` and removed.  A table with a
        header line is created in the "all" table folder and the raw
        per-genome tables are appended to it.  ``csv_outfolder`` is
        accepted for interface compatibility but not used.
        '''
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                term_gff = os.path.join(term_outfolder, gff)
                self.helper.sort_gff(term_gff, self.tmps["gff"])
                shutil.move(self.tmps["gff"], term_gff)
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_name = "_".join([prefix, self.suffixs["csv"]])
                csv_file = os.path.join(self.csvs["all"], csv_name)
                with open(new_gff, "w") as out:
                    out.write("##gff-version 3\n")
                self.helper.merge_file(term_gff, new_gff)
                os.remove(term_gff)
                pre_strain = ""
                if csv_name in os.listdir(self.csvs["all"]):
                    os.remove(csv_file)
                with open(csv_file, "w") as out_csv:
                    out_csv.write("\t".join([
                        "Genome", "Name", "Start", "End", "Strand", "Detect",
                        "Coverage_decrease", "Coverage_detail"]) + "\n")
                with open(new_gff) as fh:
                    for entry in self.gff_parser.entries(fh):
                        # append a raw table whenever the seq_id differs
                        # from that of the previous entry
                        if entry.seq_id != pre_strain:
                            self.helper.merge_file(os.path.join(
                                self.tmps["term_table"], "_".join([
                                    entry.seq_id, "term_raw.csv"])),
                                csv_file)
                        pre_strain = entry.seq_id

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        '''Run RNAfold on ``tmp_seq`` and write its output to ``tmp_sec``.

        RNAfold is started from inside a temporary folder, which is
        removed afterwards; the input/output paths are therefore
        addressed through "..".
        '''
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        try:
            os.system(" ".join([RNAfold_path, "<",
                                os.path.join("..", tmp_seq), ">",
                                os.path.join("..", tmp_sec)]))
        finally:
            # never leave the process inside the temporary folder
            os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        '''the approach for searching gene converged region terminator

        For every prefix: if its transcript gff exists, sequences are
        extracted, folded with RNAfold and turned into candidates
        (``poly_t``); then ``detect_coverage`` is called.  Afterwards the
        resulting gff files are combined and moved by ``_move_file``.
        '''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            fasta_file = os.path.join(self.fasta_path, prefix + ".fa")
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                intergenic_seq(fasta_file, tran_file, gff_file,
                               tmp_seq, tmp_index, args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file,
                       tmp_cand, args_term)
            print("Detecting terminators for " + prefix)
            detect_coverage(
                tmp_cand, gff_file, tran_file, fasta_file,
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        '''Remove the temporary files and folders created during the run.'''
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            # the merge folder only exists when sRNAs were supplied
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        '''Renumber the terminators and compute the statistics.

        Every ``*_term_all.gff`` in the "all" gff folder is rewritten as
        ``<prefix>_term.gff`` with fresh ID/Name attributes.  Then
        ``stat_term`` is called per prefix, the tables it leaves in the
        best/express/non gff folders are moved to the matching table
        folders and the ``*_term_all.gff`` file is removed.
        '''
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(new_prefix)
                num = 0
                with open(self.tmps["gff"], "w") as out_tmp, \
                        open(os.path.join(self.terms["all"], gff)) as fh:
                    out_tmp.write("##gff-version 3\n")
                    for entry in self.gff_parser.entries(fh):
                        name = '%0*d' % (5, num)
                        entry.attributes["ID"] = (
                            entry.seq_id + "_terminator" + str(num))
                        entry.attributes["Name"] = "terminator_" + name
                        entry.attribute_string = ";".join([
                            "=".join(items)
                            for items in entry.attributes.items()])
                        out_tmp.write("\t".join([
                            entry.info_without_attributes,
                            entry.attribute_string]) + "\n")
                        num += 1
                shutil.move(self.tmps["gff"], os.path.join(
                    self.terms["all"],
                    "_".join([new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            csv_name = "_".join([prefix, self.suffixs["csv"]])
            stat_term(os.path.join(self.terms["all"],
                                   "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"], csv_name),
                      os.path.join(stat_path,
                                   "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                                   "_".join([prefix, "term"])))
            for key in ("best", "express", "non"):
                shutil.move(os.path.join(self.terms[key], csv_name),
                            os.path.join(self.csvs[key], csv_name))
            os.remove(os.path.join(self.terms["all"],
                                   "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        '''Run ``helper.check_uni_attributes`` on every gff in ``folder``.'''
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript

        ``prefixs`` is accepted for interface compatibility only: the
        prefixes actually used are rebuilt from the ``*_transcript.gff``
        files found in the transcript folder.
        '''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        print("Comparing terminators with transcripts now")
        tran_prefixs = [file_.replace("_transcript.gff", "")
                        for file_ in os.listdir(self.tran_path)
                        if file_.endswith("_transcript.gff")]
        stat_folder = os.path.join(args_term.out_folder, "statistics")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            # give each statistics file a name that carries the class
            for prefix in tran_prefixs:
                shutil.move(
                    os.path.join(
                        stat_folder,
                        "stat_compare_transcript_terminator_" +
                        prefix + ".csv"),
                    os.path.join(
                        stat_folder,
                        "_".join(["stat_compare_terminator_transcript",
                                  prefix, type_ + ".csv"])))

    def run_terminator(self, args_term):
        '''Run the whole terminator detection workflow.

        Exits with an error message when the gff or fasta input is not
        assigned.
        '''
        # Validate before the inputs are touched; previously this check
        # ran only after both values had already been used.
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class Terminator(object):
    '''detection of terminator

    Drives the terminator workflow: converts the annotation gff files to
    ptt/rnt tables, runs TransTermHP on them, extracts and folds sequences
    (``intergenic_seq``, RNAfold, ``poly_t``), calls ``detect_coverage``
    for every genome and finally produces the statistics and the
    comparison with transcripts.  ``run_terminator`` is the entry point;
    every other method is a private step of that workflow.
    '''

    def __init__(self, args_term):
        '''Set up helper objects and all input/output/temporary paths.

        args_term -- argument container; ``gffs``, ``fastas``, ``trans``,
                     ``out_folder`` and ``srnas`` are read here.
        The output folders are created right away via
        ``_make_gff_folder``.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # gff output folders, one per candidate class
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "express"),
                      "best": os.path.join(self.outfolder["term"], "best"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_express")}
        # table output folders, mirroring self.terms
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "express"),
                     "best": os.path.join(self.outfolder["csv"], "best"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_express")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # names and locations of temporary files/folders
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        '''Concatenate the table bodies of ``files`` into ``combine_file``.

        For every input file only the lines after the first line that
        contains "Location" are copied.  If the last line of an input
        file has no trailing newline, one is appended so the next file
        starts on a fresh line.  Empty input files contribute nothing.
        '''
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                # Tracked per file: testing the loop variable after the
                # loop would be unbound for an empty first file and stale
                # (left over from the previous file) for a later one.
                last_line = None
                with open(file_, 'r') as fh:
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                        last_line = line
                if (last_line is not None) and ("\n" not in last_line):
                    result.write("\n")

    def _make_gff_folder(self):
        '''Create the gff and table output folder of every candidate class.'''
        for key in ("all", "best", "express", "non"):
            self.helper.check_make_folder(self.terms[key])
            self.helper.check_make_folder(self.csvs[key])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        '''Convert every ``.gff`` file in ``gff_path`` to ptt/rnt files.

        Returns ``(file_types, prefixs)``: ``file_types`` maps each file
        prefix to "srna" (a matching sRNA gff was found and converted as
        well) or "normal"; ``prefixs`` lists the prefixes in listing
        order.  Exits the program when the matching fasta file is missing.
        '''
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                    fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                srna = None
                if sRNAs:
                    # NOTE(review): this re-parses the sRNA folder for every
                    # gff file; looks loop-invariant - confirm before hoisting.
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                # fasta is known to be set here, so only srna decides.
                if srna:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, srna,
                        srna.replace(".gff", ".rnt"))
                    file_types[prefix] = "srna"
                else:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        '''Merge the ptt/rnt (and sRNA rnt) tables of each prefix.

        The merged table is written to ``<combine_path>/<prefix>.ptt``.
        Prefixes whose type is neither "normal" nor "srna" are skipped.
        '''
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            if file_type not in ("normal", "srna"):
                continue
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            files = [os.path.join(gff_path, prefix + ".ptt"),
                     os.path.join(gff_path, prefix + ".rnt")]
            if file_type == "srna":
                files.append(os.path.join(
                    srna_path, "_".join([prefix, "sRNA.rnt"])))
            self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        '''Run TransTermHP for one genome; its stdout is written to ``out``.

        The ``--t2t-perf`` and ``--bag-output`` files are placed in
        ``out_path`` and named after ``prefix``.
        '''
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        '''Run TransTermHP on every combined ptt file.

        Results go to ``<hp_folder>/<prefix>``; the combine folder is
        removed afterwards.  Exits when a matching fasta file is missing.
        '''
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                    self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                # "with" guarantees the report is closed even if the
                # external call fails.
                with open(os.path.join(
                        out_path,
                        "_".join([prefix, "terminators.txt"])), "w") as out:
                    self._TransTermHP(fasta, file_, out_path,
                                      prefix, out, args_term)
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        '''Convert the TransTermHP ``.bag`` outputs to gff files.

        Only sub-folders of ``hp_folder`` whose name equals one of
        ``prefixs`` are considered; the converted files are then combined
        via ``multiparser.combine_gff``.
        '''
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        '''Return the folder holding the wiggle files to use.

        If both ``tex_wigs`` and ``frag_wigs`` are given, their plain
        files (sub-folders are ignored) are copied into a "merge_wigs"
        folder next to ``tex_wigs`` and that folder is returned.  If only
        one is given, it is returned unchanged.  Exits when neither is
        assigned.
        '''
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig_folder in (args_term.tex_wigs, args_term.frag_wigs):
                for wig in os.listdir(wig_folder):
                    if not os.path.isdir(os.path.join(wig_folder, wig)):
                        shutil.copy(os.path.join(wig_folder, wig), merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: no proper wig files!!!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''Merge the sRNA annotation into the gff files when available.

        When ``sRNAs`` is given, the annotation gff and the sRNA gff of
        every prefix are merged and sorted into the temporary merge
        folder, which is returned.  Otherwise ``gff_path`` is returned
        unchanged.
        '''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                # start from a clean scratch file for every prefix
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        '''Sort each ``*_term.gff`` and build its "all candidates" outputs.

        For every ``<prefix>_term.gff`` in ``term_outfolder`` the file is
        sorted, copied (below a gff-version header) to the "all" gff
        folder as ``<prefix>_term_all.gff`` and removed.  A table with a
        header line is created in the "all" table folder and the raw
        per-genome tables are appended to it.  ``csv_outfolder`` is
        accepted for interface compatibility but not used.
        '''
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                term_gff = os.path.join(term_outfolder, gff)
                self.helper.sort_gff(term_gff, self.tmps["gff"])
                shutil.move(self.tmps["gff"], term_gff)
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_name = "_".join([prefix, self.suffixs["csv"]])
                csv_file = os.path.join(self.csvs["all"], csv_name)
                with open(new_gff, "w") as out:
                    out.write("##gff-version 3\n")
                self.helper.merge_file(term_gff, new_gff)
                os.remove(term_gff)
                pre_strain = ""
                if csv_name in os.listdir(self.csvs["all"]):
                    os.remove(csv_file)
                with open(csv_file, "w") as out_csv:
                    out_csv.write("\t".join([
                        "strain", "name", "start", "end", "strand",
                        "detect", "coverage_detail"]) + "\n")
                with open(new_gff) as fh:
                    for entry in self.gff_parser.entries(fh):
                        # append a raw table whenever the seq_id differs
                        # from that of the previous entry
                        if entry.seq_id != pre_strain:
                            self.helper.merge_file(os.path.join(
                                self.tmps["term_table"], "_".join([
                                    entry.seq_id, "term_raw.csv"])),
                                csv_file)
                        pre_strain = entry.seq_id

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        '''Run RNAfold on ``tmp_seq`` and write its output to ``tmp_sec``.

        RNAfold is started from inside a temporary folder, which is
        removed afterwards; the input/output paths are therefore
        addressed through "..".
        '''
        print("Computing secondray structure of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        try:
            os.system(" ".join([RNAfold_path, "<",
                                os.path.join("..", tmp_seq), ">",
                                os.path.join("..", tmp_sec)]))
        finally:
            # never leave the process inside the temporary folder
            os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        '''Detect terminators for every prefix.

        For every prefix sequences are extracted, folded with RNAfold and
        turned into candidates (``poly_t``); then ``detect_coverage`` is
        called.  Afterwards the resulting gff files are combined and
        moved by ``_move_file``.
        '''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            fasta_file = os.path.join(self.fasta_path, prefix + ".fa")
            print("Extracting seq of {0}".format(prefix))
            intergenic_seq(fasta_file, tran_file, gff_file, tmp_seq)
            self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                              prefix)
            poly_t(tmp_seq, tmp_sec, gff_file, tran_file,
                   tmp_cand, args_term)
            print("detection of terminator")
            detect_coverage(
                tmp_cand, gff_file, tran_file, fasta_file,
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        '''Remove the temporary files and folders created during the run.'''
        self.helper.remove_tmp(args_term.gffs)
        self.helper.remove_tmp(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            # the merge folder only exists when sRNAs were supplied
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp(args_term.trans)
        self.helper.remove_tmp(args_term.tex_wigs)
        self.helper.remove_tmp(args_term.frag_wigs)
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        '''Renumber the terminators and optionally compute statistics.

        Every ``*_term_all.gff`` in the "all" gff folder is rewritten as
        ``<prefix>_term.gff`` with fresh ID/Name attributes.  Only when
        ``args_term.stat`` is set, ``stat_term`` is called per prefix, the
        tables it leaves in the best/express/non gff folders are moved to
        the matching table folders and the ``*_term_all.gff`` file is
        removed.
        '''
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(new_prefix)
                num = 0
                with open(self.tmps["gff"], "w") as out_tmp, \
                        open(os.path.join(self.terms["all"], gff)) as fh:
                    out_tmp.write("##gff-version 3\n")
                    for entry in self.gff_parser.entries(fh):
                        name = '%0*d' % (5, num)
                        entry.attributes["ID"] = "term" + str(num)
                        entry.attributes["Name"] = "Terminator_" + name
                        entry.attribute_string = ";".join([
                            "=".join(items)
                            for items in entry.attributes.items()])
                        out_tmp.write("\t".join([
                            entry.info_without_attributes,
                            entry.attribute_string]) + "\n")
                        num += 1
                shutil.move(self.tmps["gff"], os.path.join(
                    self.terms["all"],
                    "_".join([new_prefix, self.suffixs["gff"]])))
        if args_term.stat:
            stat_path = os.path.join(args_term.out_folder, "statistics")
            for prefix in new_prefixs:
                csv_name = "_".join([prefix, self.suffixs["csv"]])
                stat_term(os.path.join(self.terms["all"],
                                       "_".join([prefix,
                                                 self.suffixs["gff"]])),
                          os.path.join(self.csvs["all"], csv_name),
                          os.path.join(stat_path,
                                       "_".join(["stat", prefix + ".csv"])),
                          os.path.join(self.terms["best"],
                                       "_".join([prefix, "term"])),
                          os.path.join(self.terms["express"],
                                       "_".join([prefix, "term"])),
                          os.path.join(self.terms["non"],
                                       "_".join([prefix, "term"])))
                for key in ("best", "express", "non"):
                    shutil.move(os.path.join(self.terms[key], csv_name),
                                os.path.join(self.csvs[key], csv_name))
                os.remove(os.path.join(
                    self.terms["all"],
                    "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        '''Run ``helper.check_uni_attributes`` on every gff in ``folder``.'''
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term):
        '''Compare the terminators of each class with the transcripts.

        After every ``compare_term_tran`` call the statistics file it
        produced is renamed so that its name carries the candidate class.
        '''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        stat_folder = os.path.join(args_term.out_folder, "statistics")
        for type_ in ("best", "express", "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator")
            shutil.move(
                os.path.join(stat_folder,
                             "stat_comparison_terminator_transcript.csv"),
                os.path.join(stat_folder,
                             "stat_comparison_terminator_transcript_" +
                             type_ + ".csv"))

    def run_terminator(self, args_term):
        '''Run the whole terminator detection workflow.

        Exits with an error message when the gff or fasta input is not
        assigned.
        '''
        # Validate before the inputs are touched; previously this check
        # ran only after both values had already been used.
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: please assign gff annotation folder "
                  "and fasta folder!!!")
            sys.exit()
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        '''Create the helper objects and derive the working paths.

        args_pro has to provide tsss, gffs (may be None), fastas and
        output_folder.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # "all" and "all_no_orph" are bare file names; they are joined
        # with tmp_folder where they are used.
        self.fastas = {
            "pri": os.path.join(self.tmp_folder, "primary.fa"),
            "sec": os.path.join(self.tmp_folder, "secondary.fa"),
            "inter": os.path.join(self.tmp_folder, "internal.fa"),
            "anti": os.path.join(self.tmp_folder, "antisense.fa"),
            "orph": os.path.join(self.tmp_folder, "orphan.fa"),
            "all_no_orph": "without_orphan.fa",
            "all": "all_type.fa",
            "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
            "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        '''Remove an existing result folder of a previous run.

        Returns out_path/type_; "folder" inside it is deleted when it
        already exists.'''
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _log_and_call(self, command, log):
        '''write the command line to the log and execute it'''
        log.write(" ".join(command) + "\n")
        call(command)

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        program = args_pro.program.lower()
        fasta_file = os.path.join(input_path, fasta)
        if program in ("meme", "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            # the command is built once so that the logged line is
            # exactly what gets executed
            command = command + ["-oc", os.path.join(meme_folder, folder),
                                 fasta_file]
            self._log_and_call(command, log)
        if program in ("glam2", "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            command = [args_pro.glam2_path,
                       "-O", os.path.join(glam_folder, folder),
                       "-w", str(width), "-b", str(width),
                       "-r", str(args_pro.num_motif),
                       "-n", str(args_pro.end_run), "n", fasta_file]
            self._log_and_call(command, log)

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        # width has the form "min-max"
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        program = args_pro.program.lower()
        fasta_file = os.path.join(input_path, fasta)
        if program in ("meme", "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            command = command + ["-oc", os.path.join(meme_folder, folder),
                                 fasta_file]
            self._log_and_call(command, log)
        if program in ("glam2", "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            command = [args_pro.glam2_path,
                       "-O", os.path.join(glam_folder, folder),
                       "-a", str(min_width), "-b", str(max_width),
                       "-r", str(args_pro.num_motif),
                       "-n", str(args_pro.end_run), "n", fasta_file]
            self._log_and_call(command, log)

    def _get_fasta_file(self, fasta_path, prefix):
        '''Return the name of the fasta file in fasta_path which
        belongs to prefix.

        A file matches when it ends with .fa, .fna or .fasta and its
        name without that suffix equals prefix. The program stops with
        an error message if no file matches; previously the last entry
        of the folder was returned in that case, whatever it was.'''
        for fasta in os.listdir(fasta_path):
            for suffix in (".fa", ".fna", ".fasta"):
                if (fasta.endswith(suffix)) and (
                        prefix == fasta.replace(suffix, "")):
                    return fasta
        print("Error: The fasta file of {0} can not be found!".format(
            prefix))
        sys.exit()

    def _check_gff(self, gffs):
        '''run check_uni_attributes for every .gff file of the folder'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Generate the merged fasta files (all types and without
        orphan) and move all class fasta files to input_path with
        prefix + "_allgenome_" in front of the class name.'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa: primary + secondary + internal + antisense
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        for class_ in ("sec", "inter", "anti"):
            self.helper.merge_file(self.fastas[class_],
                                   self.fastas["tmp_fa"])
        # tmp_all: tmp_fa + orphan
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        moves = ((self.fastas["pri"], "allgenome_primary.fa"),
                 (self.fastas["sec"], "allgenome_secondary.fa"),
                 (self.fastas["inter"], "allgenome_internal.fa"),
                 (self.fastas["anti"], "allgenome_antisense.fa"),
                 (self.fastas["orph"], "allgenome_orphan.fa"),
                 (all_type, "allgenome_all_types.fa"),
                 (all_no_orph, "allgenome_without_orphan.fa"))
        for source, target in moves:
            shutil.move(source, "_".join([out_prefix, target]))

    def _split_fasta_by_strain(self, input_path):
        '''Split the "allgenome" fasta files of input_path by strain.

        Files without "allgenome" in their name are deleted first.
        The strain is taken from the header (everything after the
        second "_"); the sequences are appended to a file where
        "allgenome" is replaced by the strain. If a fasta file contains
        only one strain, the strain file is removed again.'''
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        for fasta in os.listdir(input_path):
            if not fasta.endswith(".fa"):
                continue
            filename = fasta.split("allgenome")
            pre_strain = ""
            num_strain = 0
            # the output handle is kept per input file, a handle of a
            # previous file is never reused
            out = None
            out_file = None
            try:
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            strain = "_".join(line.split("_")[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                if out is not None:
                                    out.close()
                                out_file = os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]]))
                                out = open(out_file, "a")
                                pre_strain = strain
                        # lines in front of the first usable header
                        # have no target file and are skipped
                        if out is not None:
                            out.write(line + "\n")
            finally:
                if out is not None:
                    out.close()
            # Only one strain: the strain file would be a copy of the
            # allgenome file. Files without any header created nothing,
            # so there is nothing to remove for them.
            if num_strain == 1:
                os.remove(out_file)

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        '''Run MEME and/or GLAM2 for the selected TSS classes of
        every prefix with all assigned widths.'''
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        program = args_pro.program.lower()
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if program in ("both", "meme"):
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            if program in ("both", "glam2"):
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # the class is the last part of the file name; all_types
                # and without_orphan consist of two parts
                long_class = "_".join(names[-2:])
                if (names[-1] in input_fastas) or (
                        (long_class in ("all_types", "without_orphan")) and (
                        long_class in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                              fasta, width))
                        log.write("Computing promoters of {0} - "
                                  "length {1}.\n".format(fasta, width))
                        if "-" in width:
                            self._run_small_motif(input_path, out_path,
                                                  filename, fasta, width,
                                                  args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path,
                                                   filename, fasta, width,
                                                   args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        # Only the folder which is listed differs between the two
        # modes; the TSS files are always read from self.tss_path.
        if args_pro.source:
            tss_list_folder = self.tss_path
        else:
            tss_list_folder = os.path.join(args_pro.output_folder,
                                           "TSS_classes")
        for tss in os.listdir(tss_list_folder):
            if tss.endswith("_TSS.gff"):
                self.helper.merge_file(os.path.join(
                    self.tss_path, tss), self.all_tss)
        for fasta in os.listdir(args_pro.fastas):
            if fasta.endswith((".fa", ".fna", ".fasta")):
                self.helper.merge_file(os.path.join(
                    args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''remove the temporary files and folders'''
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        strains = list(prefixs)
        # _combine_file already appends "allfasta" to prefixs; do not
        # generate the tables of allfasta twice
        if combine and ("allfasta" not in strains):
            strains.append("allfasta")
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            for tool, tool_folder in (("meme", "MEME"), ("glam2", "GLAM2")):
                if program.lower() not in ("both", tool):
                    continue
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, tool_folder)):
                    csv_file = os.path.join(output_folder, strain,
                                            tool_folder, folder,
                                            tool + ".csv")
                    gen_promoter_table(
                        os.path.join(output_folder, strain, tool_folder,
                                     folder, tool + ".txt"),
                        csv_file, tss_file, tool)
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes"))
            print("Classifying TSSs and extracting sequence of {0}".format(
                prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        '''Convert the numbers of args_pro.use_tss (1-7) to the names
        of the TSS classes. Any other number stops the program.'''
        tss_types = {1: "all_types", 2: "primary", 3: "secondary",
                     4: "internal", 5: "antisense", 6: "orphan",
                     7: "without_orphan"}
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) not in tss_types:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
            input_fastas.append(tss_types[int(tss)])
        return input_fastas

    def run_meme(self, args_pro, log):
        '''Main entry: extract the upstream sequences of the TSSs, run
        the motif search, generate the promoter tables and clean up.'''
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(args_pro.output_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
class CircRNADetection(object):
    '''detection of circRNA'''

    def __init__(self, args_circ):
        '''Create the helper objects and derive the working paths.

        The program stops if alignment is requested (args_circ.align)
        but no fasta folder is assigned.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all",
                        "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all",
                      "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        if (args_circ.align) and (args_circ.fastas is None):
            print("Error: There is no genome fasta file!!!")
            sys.exit()
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait until all processes are finished and close their pipes'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        '''Uncompress the .bz2 and .gz files of read_folder.

        The output is written next to the compressed file; ".fa" is
        added when the name contains no fasta suffix. Returns the
        paths of the generated files.'''
        tmp_reads = []
        for read in os.listdir(read_folder):
            for suffix, program in ((".bz2", "bzcat"), (".gz", "zcat")):
                if not read.endswith(suffix):
                    continue
                mod_read = read.replace(suffix, "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_file = os.path.join(read_folder, mod_read)
                tmp_reads.append(read_file)
                print(" ".join(["unzip", read]))
                with open(read_file, "w") as read_out:
                    call([program, os.path.join(read_folder, read)],
                         stdout=read_out)
                break
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        '''run segemehl.x to generate the index of the fasta file'''
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        '''Start segemehl.x for one read file and return the process.

        stdout goes to sam_file and stderr to log_file in the alignment
        folder of fasta_prefix.'''
        out_dir = os.path.join(self.alignment_path, fasta_prefix)
        # The child process gets its own copies of the file
        # descriptors, so the handles of this process can be closed
        # right after the start instead of being leaked.
        with open(os.path.join(out_dir, sam_file), "w") as out, \
                open(os.path.join(out_dir, log_file), "w") as log:
            p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                       "-i", os.path.join(self.fasta_path, index),
                       "-d", os.path.join(self.fasta_path, fasta),
                       "-q", os.path.join(args_circ.read_folder, read),
                       "-S"], stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        '''Index every fasta file and align all read files to it.

        At most args_circ.cores alignments run at the same time.
        Returns the names of the alignments (read prefix + "_" + fasta
        prefix) and the fasta prefixes.'''
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                if not read.endswith((".fa", ".fna", "fasta")):
                    continue
                filename = read.split(".")
                read_prefix = ".".join(filename[:-1])
                sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                align_files.append("_".join([read_prefix, fasta_prefix]))
                print("mapping {0}".format(sam_file))
                p = self._run_segemehl_align(
                    args_circ, index, fasta, read,
                    sam_file, log_file, fasta_prefix)
                processes.append(p)
                # Only started alignments count for the limit, and
                # the finished batch is dropped so that it is not
                # waited for again.
                if len(processes) == args_circ.cores:
                    self._wait_process(processes)
                    processes = []
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        '''convert a sam file to a bam file by samtools view'''
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files):
        '''Convert all sam files of sub_alignment_path to bam files
        and remove the .log files.

        Returns (bam_files, convert_ones, remove_ones). If align_files
        is given, convert_ones contains the bam files of the sam files
        which are not listed in align_files, and remove_ones contains
        the sam files which are listed.'''
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        '''Merge bam_files to one bam file and sort it.

        The sorted file is stored as whole_reads_sort.bam in
        sub_alignment_path; the unsorted merged file is removed. The
        program stops if bam_files is empty.'''
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if not bam_files:
            print("Error: There is no bam file for merging!!")
            sys.exit()
        if len(bam_files) == 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            # argument list instead of a shell string: paths with
            # spaces or shell characters are passed unchanged
            call([samtools_path, "merge", whole_bam] + list(bam_files))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
              whole_bam])
        os.remove(whole_bam)

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        '''convert the sorted bam file to a sam file with header'''
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _remove_existing(self, paths):
        '''remove the files of paths which still exist'''
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        '''Merge and sort the bam files, convert the result to sam and
        remove the intermediate files.'''
        self._run_samtools_merge_sort(samtools_path, sub_alignment_path,
                                      bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        # tmp_reads is the same list for every genome; the files are
        # already gone when this runs for the second genome
        self._remove_existing(tmp_reads)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''Run testrealign.x for the sorted sam file of prefix.

        stderr is stored in prefix.log of the splice folder. The .bed
        files of the current directory are moved to the splice folder
        afterwards and the sorted alignment files are removed.'''
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = [os.path.join(segemehl_path, "testrealign.x"),
                   "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                   "-q", os.path.join(sub_alignment_path,
                                      self.bams["sort"] + ".sam"),
                   "-n"]
        # same as "2> err_log" of the shell, but without a shell
        with open(err_log, "w") as err:
            call(command, stderr=err)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        '''Collect the bed files of all sequences of every fasta file.

        For every fasta file a folder named like the file without
        suffix is created in the current directory. It contains the
        bed files of every header and the merged "_all" bed files.
        Returns the names of these folders.'''
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            headers = []
            with open(os.path.join(fastas, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        headers.append(line[1:])
            filename = fasta.split(".")
            fasta_prefix = ".".join(filename[:-1])
            tmp_prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(os.getcwd(), fasta_prefix))
            for header in headers:
                shutil.copyfile(
                    os.path.join(splice_path, header, self.splices["file"]),
                    os.path.join(fasta_prefix, "_".join([
                        self.splices["splice"], header + ".bed"])))
                shutil.copyfile(
                    os.path.join(splice_path, header, self.trans["file"]),
                    os.path.join(fasta_prefix, "_".join([
                        self.trans["trans"], header + ".bed"])))
            out_splice = os.path.join(fasta_prefix, self.splices["all_file"])
            out_trans = os.path.join(fasta_prefix, self.trans["all_file"])
            if len(headers) > 1:
                for file_ in os.listdir(fasta_prefix):
                    if (self.splices["splice"] in file_) and (
                            self.splices["all"] not in file_):
                        self.helper.merge_file(
                            os.path.join(fasta_prefix, file_), out_splice)
                    elif (self.trans["trans"] in file_) and (
                            self.trans["all"] not in file_):
                        self.helper.merge_file(
                            os.path.join(fasta_prefix, file_), out_trans)
            else:
                # only one sequence: its files are the "_all" files
                shutil.move(
                    os.path.join(fasta_prefix, "_".join([
                        self.splices["splice"], headers[0] + ".bed"])),
                    out_splice)
                shutil.move(
                    os.path.join(fasta_prefix, "_".join([
                        self.trans["trans"], headers[0] + ".bed"])),
                    out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''Copy the bed folders to the splice folder, detect the
        circRNAs and generate the tables and gff files.'''
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(
                    os.path.join(self.splice_path, prefix)):
                circ_table = os.path.join(
                    self.candidate_path, prefix,
                    "_".join(["circRNA", prefix + "_all.csv"]))
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    circ_table, args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA",
                                           prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    circ_table, args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        '''Select the folder of the bam files.

        If normal_bams and frag_bams are both assigned, the bam files
        of frag_bams are copied to normal_bams. Returns the folder,
        the names of the copied files and the paths of all bam files
        of the folder. The program stops if no folder is assigned.'''
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(
                        os.path.join(args_circ.frag_bams, frag),
                        os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''Main entry: align the reads (or use the assigned bam
        files), run testrealign.x and generate the circRNA tables,
        statistics and gff files.'''
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        self.multiparser.parser_fasta(args_circ.fastas)
        if args_circ.align:
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class TranscriptDetection(object):
    '''Detect transcripts from wiggle files and post-process the results.

    ``run_transcript`` is the entry point. It detects transcripts for the
    fragmented and/or TEX+/- libraries, merges both results per strain,
    optionally adjusts the transcripts to the genome annotation and then
    compares them with annotation features, TSSs and terminators.
    '''

    def __init__(self, args_tran):
        '''Create the helper objects and derive all working paths.

        args_tran: argument container; only ``out_folder`` is read here.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # names/paths of the intermediate files used by the methods below
        self.tmps = {
            "gff": "tmp.gff",
            "merge": "tmp_merge",
            "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
            "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
            "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
            "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
            "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
            "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
            "overlap": os.path.join(self.gff_outfolder, "tmp_overlap"),
        }
        # file name endings appended to "<strain>_"
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type,
                            strain, libs, args_tran):
        '''Call detect_transcript for one strain.

        The output path handed to detect_transcript is
        <out_folder>/<strain>_<wig_type>.
        '''
        print("Computing transcript for {0}".format(strain))
        out = os.path.join(args_tran.out_folder,
                           "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs,
                          out, wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Detect transcripts for every strain found in <wigs>/tmp.

        Strain names are derived from the files ending with
        "_forward.wig". Returns the list of strain names.
        '''
        wig_folder = os.path.join(wigs, "tmp")
        strains = [wig.replace("_forward.wig", "")
                   for wig in os.listdir(wig_folder)
                   if wig.endswith("_forward.wig")]
        for strain in strains:
            f_file = os.path.join(wig_folder,
                                  "_".join([strain, "forward.wig"]))
            r_file = os.path.join(wig_folder,
                                  "_".join([strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran):
        '''Compare the transcripts of every strain in ``tas`` with TSSs.

        For each matching "<strain>_TSS*.gff" file, stat_ta_tss is run and
        both the transcript file and the TSS file are replaced by the
        sorted versions of the temporary outputs.
        '''
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comaring of Transcript and TSS file")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    # replace the inputs by the sorted comparison results
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"],
                        os.path.join(args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])

    def _compare_cds(self, tas, args_tran):
        '''Compare the transcripts of every strain in ``tas`` with the
        genome annotation.

        For the annotation file "<strain>.gff", stat_ta_gff is run and both
        the transcript file and the annotation file in ``args_tran.gffs``
        are replaced by the sorted versions of the temporary outputs.
        '''
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(self.gff_outfolder,
                                     os.path.join(args_tran.gffs, "tmp"),
                                     "transcript", None)
        print("Comaring of Transcript and genome annotation")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # replace the inputs by the sorted comparison results
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["gff_ta"],
                        os.path.join(args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])

    def _compare_tss_cds(self, tas, args_tran):
        '''compare transcript with CDS and TSS

        The annotation comparison runs when ``c_feature`` is given, the
        TSS comparison when ``compare_tss`` is given; if both are given
        the annotation comparison runs first.
        '''
        with_cds = args_tran.c_feature is not None
        with_tss = args_tran.compare_tss is not None
        if with_cds or with_tss:
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
        if with_cds:
            self._compare_cds(tas, args_tran)
        if with_tss:
            self._compare_tss(tas, args_tran)

    def _for_one_wig(self, type_, args_tran):
        '''running transcript detection to one type of wig files

        type_: "tex_notex" selects the TEX+/- libraries; any other value
        selects the fragmented libraries. Returns the strain names.
        '''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Computing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            raw = os.path.join(args_tran.out_folder,
                               "_".join([strain, type_]))
            out = os.path.join(
                self.gff_outfolder,
                "_".join([strain, "transcript", type_ + ".gff"]))
            # keep only the sorted gff; the unsorted result is dropped
            self.helper.sort_gff(raw, out)
            os.remove(raw)
        return strains

    def _for_two_wigs(self, strains, args_tran):
        '''merge the results of fragmented and tex treated libs

        With both library types the two per-strain files are combined into
        "<strain>_transcript.gff" and removed; with only one type the
        existing file is simply renamed.

        Raises FileNotFoundError if both library types are requested but
        one of the two per-strain files is missing.
        '''
        use_frag = args_tran.frag_wigs is not None
        use_tex = args_tran.tex_wigs is not None
        if use_frag and use_tex:
            print("Merging fragment and tex treat one")
        for strain in strains if (use_frag or use_tex) else []:
            frag_gff = os.path.join(self.gff_outfolder,
                                    "_".join([strain, self.frag]))
            tex_gff = os.path.join(self.gff_outfolder,
                                   "_".join([strain, self.tex]))
            final_gff = os.path.join(self.gff_outfolder,
                                     "_".join([strain, self.endfix_tran]))
            if use_frag and use_tex:
                # Fail early with a clear message instead of combining
                # with an undefined or stale file name.
                for needed in (frag_gff, tex_gff):
                    if not os.path.isfile(needed):
                        raise FileNotFoundError(
                            "Transcript file {0} is missing".format(needed))
                combine(frag_gff, tex_gff, args_tran.tolerance, final_gff)
                os.remove(frag_gff)
                os.remove(tex_gff)
            elif use_frag:
                shutil.move(frag_gff, final_gff)
            else:
                shutil.move(tex_gff, final_gff)

    def _post_modify(self, tas, args_tran):
        '''modify the transcript by comparing with genome annotation

        For each strain the transcripts are passed twice through fill_gap
        ("overlap" and "uni"), the two outputs are merged, sorted and
        handed to longer_ta. Finally the gff output folder is replaced by
        the folder holding the modified transcripts.
        '''
        for ta in tas:
            # NOTE(review): if no annotation file matches, ``gff`` keeps
            # the last listed name - confirm callers guarantee a match.
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} refering to {1}".format(ta, gff))
            annotation = os.path.join(args_tran.gffs, gff)
            tran_file = os.path.join(self.tran_path,
                                     "_".join([ta, self.endfix_tran]))
            fill_gap(annotation, tran_file, "overlap",
                     self.tmps["overlap"])
            fill_gap(annotation, tran_file, "uni", self.tmps["uni"])
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # Drop a left-over merge file (e.g. from an aborted run) so
            # the merge below starts from scratch.
            if os.path.exists(tmp_merge):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder,
                                   "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(
                final_out,
                os.path.join(self.tmps["tran"],
                             "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # the modified transcripts become the new gff output folder
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Remove the temporary folders created during the run.'''
        if "tmp_wig" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "tmp_wig"))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran):
        '''searching the associated terminator to transcript

        Does nothing when ``args_tran.terms`` is None.
        '''
        if args_tran.terms is None:
            return
        print("Comparing between terminators and transcripts")
        self.multiparser.parser_gff(args_tran.terms, "term")
        if args_tran.gffs is not None:
            self.multiparser.combine_gff(
                args_tran.gffs,
                os.path.join(args_tran.terms, "tmp"), None, "term")
        compare_term_tran(self.gff_outfolder,
                          os.path.join(args_tran.terms, "tmp"),
                          args_tran.fuzzy_term, args_tran.fuzzy_term,
                          args_tran.out_folder, "transcript",
                          args_tran.terms, self.gff_outfolder)

    def run_transcript(self, args_tran):
        '''Run the whole transcript detection workflow.

        Exits via sys.exit() when neither fragmented nor TEX+/- wiggle
        files are assigned.
        '''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            print("Error: There is no wigs files!!!!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran)
        tas = []
        if args_tran.gffs is not None:
            # sort every annotation file in place
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(os.path.join(args_tran.gffs, gff),
                                         self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(
                args_tran.gffs, os.path.join(args_tran.gffs, "tmp"),
                None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            # only non-empty transcript files are post-modified
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff") and os.path.getsize(
                        os.path.join(self.tran_path, ta)) != 0:
                    tas.append(ta.replace("_" + self.endfix_tran, ""))
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran)
        self._compare_term_tran(args_tran)
        print("Generating table for the details")
        gen_table_transcript(self.gff_outfolder, args_tran)
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._remove_file(args_tran)
class TranscriptDetection(object):
    '''Detect transcripts from wiggle files and post-process the results.

    ``run_transcript`` is the entry point. It detects transcripts for the
    fragmented and/or TEX+/- libraries, merges both results per strain,
    optionally adjusts the transcripts to the genome annotation and then
    compares them with annotation features, TSSs and terminators. The
    progress is reported to the ``log`` object passed to the methods
    (anything providing ``write``).
    '''

    def __init__(self, args_tran):
        '''Create the helper objects and derive all working paths.

        args_tran: argument container; only ``out_folder`` is read here.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # names/paths of the intermediate files used by the methods below
        self.tmps = {
            "gff": "tmp.gff",
            "merge": "tmp_merge",
            "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
            "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
            "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
            "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
            "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
            "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
            "overlap": os.path.join(self.gff_outfolder, "tmp_overlap"),
        }
        # file name endings appended to "<strain>_"
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type,
                            strain, libs, args_tran):
        '''Call detect_transcript for one strain.

        The output path handed to detect_transcript is
        <out_folder>/<strain>_<wig_type>.
        '''
        print("Computing transcripts for {0}".format(strain))
        out = os.path.join(args_tran.out_folder,
                           "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs,
                          out, wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Detect transcripts for every strain found in <wigs>/tmp.

        Strain names are derived from the files ending with
        "_forward.wig". Returns the list of strain names.
        '''
        wig_folder = os.path.join(wigs, "tmp")
        strains = [wig.replace("_forward.wig", "")
                   for wig in os.listdir(wig_folder)
                   if wig.endswith("_forward.wig")]
        for strain in strains:
            f_file = os.path.join(wig_folder,
                                  "_".join([strain, "forward.wig"]))
            r_file = os.path.join(wig_folder,
                                  "_".join([strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran, log):
        '''Compare the transcripts of every strain in ``tas`` with TSSs.

        For each matching "<strain>_TSS*.gff" file, stat_ta_tss is run and
        both the transcript file and the TSS file are replaced by the
        sorted versions of the temporary outputs. The statistics file
        path of every strain is written to ``log``.
        '''
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comaring of transcripts and TSSs")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with TSSs.\n")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    # replace the inputs by the sorted comparison results
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"],
                        os.path.join(args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])
            log.write("\t" + stat_tss_out + "\n")

    def _compare_cds(self, tas, args_tran, log):
        '''Compare the transcripts of every strain in ``tas`` with the
        genome annotation.

        For the annotation file "<strain>.gff", stat_ta_gff is run and both
        the transcript file and the annotation file in ``args_tran.gffs``
        are replaced by the sorted versions of the temporary outputs. The
        statistics file path of every strain is written to ``log``.
        '''
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.gffs, "tmp"),
            "transcript", None)
        print("Comaring of transcripts and genome annotations")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with genome annotations.\n")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # replace the inputs by the sorted comparison results
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["gff_ta"],
                        os.path.join(args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])
            log.write("\t" + stat_gff_out + ".\n")

    def _compare_tss_cds(self, tas, args_tran, log):
        '''compare transcript with CDS and TSS

        The annotation comparison runs when ``c_feature`` is given, the
        TSS comparison when ``compare_tss`` is given; if both are given
        the annotation comparison runs first.
        '''
        with_cds = args_tran.c_feature is not None
        with_tss = args_tran.compare_tss is not None
        if with_cds or with_tss:
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
        if with_cds:
            self._compare_cds(tas, args_tran, log)
        if with_tss:
            self._compare_tss(tas, args_tran, log)

    def _for_one_wig(self, type_, args_tran):
        '''running transcript detection to one type of wig files

        type_: "tex_notex" selects the TEX+/- libraries; any other value
        selects the fragmented libraries. Returns the strain names.
        '''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Importing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            raw = os.path.join(args_tran.out_folder,
                               "_".join([strain, type_]))
            out = os.path.join(
                self.gff_outfolder,
                "_".join([strain, "transcript", type_ + ".gff"]))
            print(raw)
            # keep only the sorted gff; the unsorted result is dropped
            self.helper.sort_gff(raw, out)
            os.remove(raw)
        return strains

    def _for_two_wigs(self, strains, args_tran, log):
        '''merge the results of fragmented and tex treated libs

        With both library types the two per-strain files are combined into
        "<strain>_transcript.gff" and removed; with only one type the
        existing file is simply renamed. Every generated file is reported
        to ``log``.

        Raises FileNotFoundError if both library types are requested but
        one of the two per-strain files is missing.
        '''
        use_frag = args_tran.frag_wigs is not None
        use_tex = args_tran.tex_wigs is not None
        if not (use_frag or use_tex):
            return
        if use_frag and use_tex:
            log.write("Running combine_frag_tex.py to merge the results from "
                      "fragmented libs and dRNA-Seq libs.\n")
            print("Merging fragmented and tex treated ones")
        for strain in strains:
            frag_gff = os.path.join(self.gff_outfolder,
                                    "_".join([strain, self.frag]))
            tex_gff = os.path.join(self.gff_outfolder,
                                   "_".join([strain, self.tex]))
            final_gff = os.path.join(self.gff_outfolder,
                                     "_".join([strain, self.endfix_tran]))
            if use_frag and use_tex:
                # Fail early with a clear message instead of combining
                # with an undefined or stale file name.
                for needed in (frag_gff, tex_gff):
                    if not os.path.isfile(needed):
                        raise FileNotFoundError(
                            "Transcript file {0} is missing".format(needed))
                combine(frag_gff, tex_gff, args_tran.tolerance, final_gff)
                os.remove(frag_gff)
                os.remove(tex_gff)
            elif use_frag:
                shutil.move(frag_gff, final_gff)
            else:
                shutil.move(tex_gff, final_gff)
            log.write("\t" + final_gff + " is generated.\n")

    def _post_modify(self, tas, args_tran):
        '''modify the transcript by comparing with genome annotation

        For each strain the transcripts are passed twice through fill_gap
        ("overlap" and "uni"), the two outputs are merged, sorted and
        handed to longer_ta. Finally the gff output folder is replaced by
        the folder holding the modified transcripts.
        '''
        for ta in tas:
            # NOTE(review): if no annotation file matches, ``gff`` keeps
            # the last listed name - confirm callers guarantee a match.
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} by refering to {1}".format(ta, gff))
            annotation = os.path.join(args_tran.gffs, gff)
            tran_file = os.path.join(self.tran_path,
                                     "_".join([ta, self.endfix_tran]))
            fill_gap(annotation, tran_file, "overlap",
                     self.tmps["overlap"], args_tran.modify)
            fill_gap(annotation, tran_file, "uni",
                     self.tmps["uni"], args_tran.modify)
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # Drop a left-over merge file (e.g. from an aborted run) so
            # the merge below starts from scratch.
            if os.path.exists(tmp_merge):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder,
                                   "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(
                final_out,
                os.path.join(self.tmps["tran"],
                             "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # the modified transcripts become the new gff output folder
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Remove the temporary folders created during the run.'''
        for folder in ("tmp_wig", "merge_wigs"):
            if folder in os.listdir(args_tran.out_folder):
                shutil.rmtree(os.path.join(args_tran.out_folder, folder))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran, log):
        '''searching the associated terminator to transcript

        Does nothing when ``args_tran.terms`` is None. Afterwards the
        files in <out_folder>/statistics starting with
        "stat_compare_transcript_terminator_" are reported to ``log``.
        '''
        if args_tran.terms is None:
            return
        print("Comparing between terminators and transcripts")
        self.multiparser.parser_gff(args_tran.terms, "term")
        if args_tran.gffs is not None:
            self.multiparser.combine_gff(
                args_tran.gffs,
                os.path.join(args_tran.terms, "tmp"), None, "term")
        log.write("Running compare_tran_term.py to compare transcripts "
                  "with terminators.\n")
        compare_term_tran(self.gff_outfolder,
                          os.path.join(args_tran.terms, "tmp"),
                          args_tran.fuzzy_term, args_tran.fuzzy_term,
                          args_tran.out_folder, "transcript",
                          args_tran.terms, self.gff_outfolder)
        for file_ in os.listdir(os.path.join(args_tran.out_folder,
                                             "statistics")):
            if file_.startswith("stat_compare_transcript_terminator_"):
                log.write("\t" + file_ + " is generated.\n")

    def _re_table(self, args_tran, log):
        '''Call reorganize_table on the table of every transcript gff.

        The table path is <out_folder>/tables/<gff name with .csv>.
        '''
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for gff in os.listdir(self.gff_outfolder):
            if os.path.isfile(os.path.join(self.gff_outfolder, gff)):
                tran_table = os.path.join(args_tran.out_folder, "tables",
                                          gff.replace(".gff", ".csv"))
                reorganize_table(args_tran.libs, args_tran.merge_wigs,
                                 "Coverage_details", tran_table)
                log.write("\t" + tran_table + "\n")

    def _list_files(self, folder, log, end):
        '''Write the names of the files in ``folder`` to ``log``.

        end: only names ending with this suffix are listed; None lists
        every entry. Names are written in sorted order so that the log
        does not depend on the arbitrary order of os.listdir.
        '''
        log.write("The following files in {0} are generated:\n".format(folder))
        for file_ in sorted(os.listdir(folder)):
            if (end is None) or file_.endswith(end):
                log.write("\t" + file_ + "\n")

    def run_transcript(self, args_tran, log):
        '''Run the whole transcript detection workflow.

        Exits via sys.exit() when neither fragmented nor TEX+/- wiggle
        files are assigned. The annotation-based modification is skipped
        when "none" is contained in ``args_tran.modify``.
        '''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            log.write("No wig file is assigned.\n")
            print("Error: There is no wiggle file!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on fragmented libs.\n")
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on dRNA-Seq libs.\n")
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran, log)
        tas = []
        if "none" not in args_tran.modify:
            # sort every annotation file in place
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(os.path.join(args_tran.gffs, gff),
                                         self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(
                args_tran.gffs, os.path.join(args_tran.gffs, "tmp"),
                None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            # only non-empty transcript files are post-modified
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff") and os.path.getsize(
                        os.path.join(self.tran_path, ta)) != 0:
                    tas.append(ta.replace("_" + self.endfix_tran, ""))
            log.write("Running fill_gap.py to modify transcripts "
                      "based on genome annotations.\n")
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran, log)
        self._compare_term_tran(args_tran, log)
        print("Generating tables for the details")
        log.write("Running gen_table_tran.py to generate the table of transcripts.\n")
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._list_files(os.path.join(args_tran.out_folder, "tables"),
                         log, None)
        log.write("Running plot_tran to plot the distribution of the length of "
                  "the transcripts.\n")
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._list_files(self.stat_path, log, ".png")
        self._re_table(args_tran, log)
        self._remove_file(args_tran)
class sORFDetection(object):
    '''detection of sORF

    ``run_sorf_detection`` is the entry point: it validates the inputs,
    derives the intergenic regions from transcripts and annotation, calls
    sorf_detection per genome and finally runs the statistics.
    '''

    def __init__(self, args_sorf):
        '''Create the helper objects and derive all working paths.

        Optional and missing input folders yield a path of None instead
        of raising, so that ``_check_necessary_files`` can report missing
        required inputs with a readable message.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = self._tmp_folder(args_sorf.tsss)
        self.srna_path = self._tmp_folder(args_sorf.srnas)
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = self._tmp_folder(args_sorf.trans)
        self.fasta_path = self._tmp_folder(args_sorf.fastas)
        # sub-folder names used below gffs/ and tables/
        self.all_cand = "all_candidates"
        self.best = "best"

    @staticmethod
    def _tmp_folder(folder):
        '''Return <folder>/tmp, or None when ``folder`` is None.'''
        if folder is None:
            return None
        return os.path.join(folder, "tmp")

    def _check_gff(self, gffs):
        '''Run helper.check_uni_attributes on every ".gff" in ``gffs``.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf):
        '''Validate the inputs and prepare the optional TSS/sRNA files.

        Exits via sys.exit() when the annotation, transcript or fasta
        folder or all wiggle folders are missing, or when UTR-derived
        detection is requested without TSS files.
        '''
        if (args_sorf.gffs is None) or \
           (args_sorf.trans is None) or \
           (args_sorf.fastas is None) or \
           ((args_sorf.tex_wigs is None) and
                (args_sorf.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_sorf.utr_detect and (args_sorf.tsss is None):
            print("Error: lack required files for UTR derived"
                  " sORF detection!!!!")
            sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf):
        '''detect the sORF based on start and stop codon
        and ribosome binding site

        After sorf_detection has run for a prefix, its "_all"/"_best" gff
        and csv outputs are moved into the all_candidates/best folders
        (only when "<prefix>_sORF_all.gff" was produced).
        '''
        all_gff_dir = os.path.join(self.gff_output, self.all_cand)
        for prefix in prefixs:
            srna_file = None
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            tss_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            sorf_detection(
                os.path.join(self.fasta_path, prefix + ".fa"), srna_file,
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])),
                tss_file,
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "forward.wig"])),
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "reverse.wig"])),
                os.path.join(all_gff_dir, "_".join([prefix, "sORF"])),
                args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) not in os.listdir(
                    all_gff_dir):
                continue
            gff_name = "_".join([prefix, "sORF.gff"])
            csv_name = "_".join([prefix, "sORF.csv"])
            # (produced file name, destination folder, final file name)
            moves = [
                ("sORF_all.gff", all_gff_dir, gff_name),
                ("sORF_best.gff",
                 os.path.join(self.gff_output, self.best), gff_name),
                ("sORF_all.csv",
                 os.path.join(self.table_output, self.all_cand), csv_name),
                ("sORF_best.csv",
                 os.path.join(self.table_output, self.best), csv_name),
            ]
            for suffix, dest_dir, dest_name in moves:
                shutil.move(
                    os.path.join(all_gff_dir, "_".join([prefix, suffix])),
                    os.path.join(dest_dir, dest_name))

    def _remove_tmp(self, args_sorf):
        '''Remove the temporary files and folders created during the run.'''
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        for folder in ("temp_wig", "merge_wigs"):
            if folder in os.listdir(args_sorf.out_folder):
                shutil.rmtree(os.path.join(args_sorf.out_folder, folder))

    def _compare_tran_cds(self, args_sorf):
        '''compare transcript and CDS to find the intergenic region

        Returns the prefixes (file names without ".gff") of all
        annotation files that were processed.
        '''
        prefixs = []
        for gff in os.listdir(args_sorf.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            print("Comparing transcript and CDS of {0}".format(prefix))
            get_intergenic(
                os.path.join(args_sorf.gffs, gff),
                os.path.join(self.tran_path,
                             "_".join([prefix, "transcript.gff"])),
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])),
                args_sorf.utr_detect, args_sorf.hypo)
        return prefixs

    def run_sorf_detection(self, args_sorf):
        '''Run the whole sORF detection workflow.

        Exits via sys.exit() when ``fuzzy_rbs`` is larger than 6 or when
        required input is missing.
        '''
        if args_sorf.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self._check_necessary_files(args_sorf)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf)
        self._start_stop_codon(prefixs, args_sorf)
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            if sorf.endswith("_sORF.gff"):
                # announce only the files that are actually evaluated
                print("Running statistics of {0}".format(sorf))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     os.path.join(args_sorf.out_folder, "statistics",
                                  "_".join(["stat",
                                            sorf.replace(".gff", ".csv")])),
                     args_sorf.utr_detect)
        self._remove_tmp(args_sorf)
class CircRNADetection(object):
    '''Detection of circular RNAs with segemehl, samtools and testrealign.

    The workflow (see run_circrna) optionally aligns reads with
    segemehl, merges and sorts the alignments with samtools, runs
    testrealign.x to find splice sites, and finally compares the splice
    sites with the genome annotation.
    '''

    def __init__(self, args_circ):
        '''Set up helper objects and the folder layout.

        args_circ must provide output_folder, gffs and fastas. A missing
        fasta folder stops the program with an error message, because
        every mode of run_circrna reads the genome fasta files.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all",
                        "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all",
                      "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam",
                     "sort": "whole_reads_sort"}
        # The fasta folder is needed with and without alignment, so the
        # check is not restricted to the alignment mode.
        if args_circ.fastas is None:
            print("Error: There is no genome fasta file!!!")
            sys.exit()
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''Wait for every process in the list and release its pipes.'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                # The process has already finished; nothing to kill.
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        '''Decompress the .bz2 and .gz read files of read_folder.

        The decompressed file is written next to the archive. If the
        name without the archive suffix does not contain a fasta
        extension, ".fa" is appended. Returns the paths of all files
        which were generated, so that they can be removed later.
        '''
        unzip_tools = ((".bz2", "bzcat"), (".gz", "zcat"))
        tmp_reads = []
        for read in os.listdir(read_folder):
            for suffix, tool in unzip_tools:
                if not read.endswith(suffix):
                    continue
                mod_read = read.replace(suffix, "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                out_file = os.path.join(read_folder, mod_read)
                tmp_reads.append(out_file)
                print(" ".join(["unzip", read]))
                with open(out_file, "w") as read_out:
                    call([tool, os.path.join(read_folder, read)],
                         stdout=read_out)
                break
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        '''Run segemehl.x to generate the index of one fasta file.'''
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        '''Start one segemehl.x alignment and return the Popen object.

        Standard output goes to sam_file and standard error to log_file,
        both inside the alignment folder of fasta_prefix.
        '''
        sub_path = os.path.join(self.alignment_path, fasta_prefix)
        # The child process gets its own copies of the descriptors, so
        # the handles of this process can be closed right after start.
        with open(os.path.join(sub_path, sam_file), "w") as out, \
                open(os.path.join(sub_path, log_file), "w") as log:
            p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                       "-i", os.path.join(self.fasta_path, index),
                       "-d", os.path.join(self.fasta_path, fasta),
                       "-q", os.path.join(args_circ.read_folder, read),
                       "-S"],
                      stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        '''Align all fasta reads of read_folder to every genome.

        At most args_circ.cores alignments run at the same time.
        Returns (align_files, prefixs): the names (without extension)
        of the generated alignments and the genome prefixes.
        '''
        read_suffixes = (".fa", ".fna", ".fasta")
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                if not read.endswith(read_suffixes):
                    continue
                read_prefix = ".".join(read.split(".")[:-1])
                sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                align_files.append("_".join([read_prefix, fasta_prefix]))
                print("mapping {0}".format(sam_file))
                p = self._run_segemehl_align(
                    args_circ, index, fasta, read,
                    sam_file, log_file, fasta_prefix)
                processes.append(p)
                # Only started alignments count for the core limit, and
                # finished ones are dropped so they are not waited twice.
                if args_circ.cores and (
                        len(processes) >= args_circ.cores):
                    self._wait_process(processes)
                    processes = []
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        '''Convert one sam file to bam format with samtools view.'''
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files):
        '''Convert the sam files of one alignment folder to bam files.

        Log files of the folder are deleted. Returns
        (bam_files, convert_ones, remove_ones):
        bam_files - all bam files which should be merged.
        convert_ones - bam files converted from sam files which are
                       not listed in align_files.
        remove_ones - sam files which are listed in align_files.
        The last two lists are only filled if align_files is not empty.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        '''Merge the bam files to one file and sort it.

        The sorted file is stored as whole_reads_sort.bam in
        sub_alignment_path; the unsorted merged file is removed. The
        program stops with an error message if bam_files is empty.
        '''
        print("Merge all bam files....")
        if not bam_files:
            print("Error: There is no BAM file for merging!!")
            sys.exit()
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) == 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            # An argument list (no shell) keeps paths with spaces intact.
            call([samtools_path, "merge", whole_bam] + list(bam_files))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
              whole_bam])
        os.remove(whole_bam)

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        '''Convert the sorted bam file to a sam file with header.'''
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        '''Merge, sort and convert the alignments, then clean up.

        After the sorted sam file is generated, the files in
        convert_ones, remove_ones and tmp_reads are deleted.
        '''
        self._run_samtools_merge_sort(samtools_path, sub_alignment_path,
                                      bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        for read in tmp_reads:
            # run_circrna hands over the same list for every genome, so
            # the reads may have been removed by a previous call.
            if os.path.exists(read):
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''Run testrealign.x for the sorted sam file of one genome.

        The error output is stored in <prefix>.log of the splice folder.
        Afterwards the .bed files of the current directory are moved to
        the splice folder and the sorted alignment files are removed
        via the helper.
        '''
        sub_splice_path = os.path.join(self.splice_path, prefix)
        self.helper.check_make_folder(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        with open(err_log, "w") as err:
            call([os.path.join(segemehl_path, "testrealign.x"),
                  "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                  "-q", os.path.join(sub_alignment_path,
                                     self.bams["sort"] + ".sam"),
                  "-n"],
                 stderr=err)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        '''Collect the bed files of all sequences of each fasta file.

        For every fasta file a folder named after the file (without
        extension) is generated in the current directory. The splice
        site and transrealigned bed files of all headers of the fasta
        file are copied to it and combined to the "_all.bed" files.
        Finally the folders of splice_path are removed via the helper.
        Returns the list of fasta prefixes.
        '''
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            headers = []
            with open(os.path.join(fastas, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        headers.append(line[1:])
            fasta_prefix = ".".join(fasta.split(".")[:-1])
            tmp_prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                os.getcwd(), fasta_prefix))
            for header in headers:
                shutil.copyfile(
                    os.path.join(splice_path, header,
                                 self.splices["file"]),
                    os.path.join(fasta_prefix, "_".join([
                        self.splices["splice"], header + ".bed"])))
                shutil.copyfile(
                    os.path.join(splice_path, header,
                                 self.trans["file"]),
                    os.path.join(fasta_prefix, "_".join([
                        self.trans["trans"], header + ".bed"])))
            out_splice = os.path.join(fasta_prefix,
                                      self.splices["all_file"])
            out_trans = os.path.join(fasta_prefix,
                                     self.trans["all_file"])
            if len(headers) > 1:
                for file_ in os.listdir(fasta_prefix):
                    if (self.splices["splice"] in file_) and (
                            self.splices["all"] not in file_):
                        self.helper.merge_file(os.path.join(
                            fasta_prefix, file_), out_splice)
                    elif (self.trans["trans"] in file_) and (
                            self.trans["all"] not in file_):
                        self.helper.merge_file(os.path.join(
                            fasta_prefix, file_), out_trans)
            else:
                # Only one sequence: its files are the combined files.
                shutil.move(os.path.join(fasta_prefix, "_".join([
                    self.splices["splice"], headers[0] + ".bed"])),
                    out_splice)
                shutil.move(os.path.join(fasta_prefix, "_".join([
                    self.trans["trans"], headers[0] + ".bed"])),
                    out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''Compare splice sites with annotation and generate gff files.

        The folder of each prefix (generated by _merge_bed) is copied
        to the splice folder. If it contains the combined splice site
        file, detect_circrna and the conversion to gff are executed.
        '''
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(os.path.join(
                    self.splice_path, prefix)):
                circ_table = os.path.join(
                    self.candidate_path, prefix,
                    "_".join(["circRNA", prefix + "_all.csv"]))
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    circ_table, args_circ,
                    os.path.join(args_circ.stat_folder, "_".join([
                        "stat_circRNA", prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    circ_table, args_circ,
                    os.path.join(self.gff_folder, prefix, "_".join([
                        prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix, "_".join([
                        prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        '''Select the folder of the bam files which should be merged.

        If normal_bams and frag_bams are both assigned, the bam files
        of frag_bams are copied to normal_bams and their names are
        stored for later removal. Returns
        (merge_folder, remove_frags, bam_files). The program stops
        with an error message if no bam folder is assigned.
        '''
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(
                        os.path.join(args_circ.frag_bams, frag),
                        os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif args_circ.normal_bams is not None:
            merge_folder = args_circ.normal_bams
        elif args_circ.frag_bams is not None:
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''Run the whole circular RNA detection.

        If args_circ.align is set, the reads are aligned with segemehl
        first; otherwise the assigned bam folders are used.
        '''
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        self.multiparser.parser_fasta(args_circ.fastas)
        if args_circ.align:
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                prefixs.append(fasta.replace(".fa", ""))
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            # Remove the fragmented bam files copied by _assign_merge_bam.
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class MEME(object):
    '''Detection of promoter motifs with MEME.

    The upstream sequences of TSSs are extracted, classified by TSS
    type, split by strain and used as input of MEME (see run_meme).
    '''

    def __init__(self, args_pro):
        '''Set up helper objects, the tmp folders and file names.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder,
                                               "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro):
        '''Run MEME with one fixed motif width.

        Nothing is done if the output folder of this fasta file and
        width already exists in out_path.
        '''
        print(os.path.join(input_path, fasta))
        folder = "_".join(["promoter_motifs", filename,
                           str(width), "nt"])
        if folder not in os.listdir(out_path):
            call([args_pro.meme_path, "-maxsize", "1000000",
                  "-dna", "-nmotifs", str(args_pro.num_motif),
                  "-w", str(width), "-maxiter", "100",
                  "-evt", str(args_pro.e_value),
                  "-oc", os.path.join(out_path, folder),
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro):
        '''Run MEME with a range of motif widths ("min-max").

        Nothing is done if the output folder of this fasta file and
        width range already exists in out_path.
        '''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]),
                           "nt"])
        if folder not in os.listdir(out_path):
            call([args_pro.meme_path, "-maxsize", "1000000",
                  "-dna", "-nmotifs", str(args_pro.num_motif),
                  "-minsites", "0", "-maxsites", "2",
                  "-minw", str(min_width), "-maxw", str(max_width),
                  "-maxiter", "100",
                  "-evt", str(args_pro.e_value),
                  "-oc", os.path.join(out_path, folder),
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        '''Return the name of the fasta file which belongs to prefix.

        A file matches if it ends with .fa, .fna or .fasta and its name
        without the extension is equal to prefix. The program stops
        with an error message if no file matches, because continuing
        with an unrelated genome would produce wrong promoters.
        '''
        for fasta in os.listdir(fasta_path):
            for suffix in (".fa", ".fna", ".fasta"):
                if fasta.endswith(suffix) and (
                        prefix == fasta.replace(suffix, "")):
                    return fasta
        print("Error: The fasta file of {0} can not be found!".format(
            prefix))
        sys.exit()

    def _check_gff(self, gffs):
        '''Check the attributes of all gff files in the folder.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Merge the fasta files of the TSS types and store all of them.

        Two combined files are generated in the tmp folder: one with
        all types and one without orphan. Repeated entries are removed
        by del_repeat_fasta. Afterwards all files are moved to
        input_path with the name <prefix>_allstrain_<type>.fa.
        '''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        for tss_type in ("sec", "inter", "anti"):
            self.helper.merge_file(self.fastas[tss_type],
                                   self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        moves = ((self.fastas["pri"], "allstrain_primary.fa"),
                 (self.fastas["sec"], "allstrain_secondary.fa"),
                 (self.fastas["inter"], "allstrain_internal.fa"),
                 (self.fastas["anti"], "allstrain_antisense.fa"),
                 (self.fastas["orph"], "allstrain_orphan.fa"),
                 (all_type, "allstrain_all_types.fa"),
                 (all_no_orph, "allstrain_without_orphan.fa"))
        for source, suffix in moves:
            shutil.move(source, "_".join([out_prefix, suffix]))

    def _split_fasta_by_strain(self, input_path):
        '''Split every "allstrain" fasta file of input_path by strain.

        Files without "allstrain" in the name are removed first. The
        strain is the part of the header behind the second "_". The
        entries are appended to a file in which "allstrain" of the
        file name is replaced by the strain. If a fasta file contains
        only one strain, the split file is removed again, since it
        would be a copy of the "allstrain" file.
        '''
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if not fasta.endswith(".fa"):
                continue
            pre_strain = ""
            num_strain = 0
            # Path of the last split file generated for THIS fasta file.
            split_file = None
            filename = fasta.split("allstrain")
            with open(os.path.join(input_path, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        datas = line.split("_")
                        strain = "_".join(datas[2:])
                        if pre_strain != strain:
                            num_strain += 1
                            if out is not None:
                                out.close()
                            split_file = os.path.join(
                                input_path, "".join([
                                    filename[0], strain, filename[-1]]))
                            out = open(split_file, "a")
                            pre_strain = strain
                    out.write(line + "\n")
            # A file without any header generated nothing, so there is
            # nothing to remove (and no file of a previous fasta file
            # must be touched).
            if num_strain == 1:
                os.remove(split_file)
        if out is not None:
            out.close()

    def _run_program(self, prefixs, args_pro):
        '''Run MEME for all fasta files and all widths of every prefix.

        A width containing "-" is treated as a range of widths.
        '''
        for prefix in prefixs:
            print(prefix)
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                        fasta, width))
                    if "-" in width:
                        self._run_small_motif(input_path, out_path,
                                              filename, fasta,
                                              width, args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path,
                                               filename, fasta,
                                               width, args_pro)

    def _combine_file(self, prefixs, args_pro):
        '''Combine all TSS files and all fasta files for a global run.

        "allfasta" is appended to prefixs (the list of the caller is
        modified) and args_pro.source is set to True before the
        upstream sequences of the combined files are extracted.
        '''
        if args_pro.source:
            tss_names = os.listdir(self.tss_path)
        else:
            # NOTE(review): the names are listed from TSS_class, but
            # the files are read from self.tss_path below - confirm
            # that this is intended.
            tss_names = os.listdir(os.path.join(
                args_pro.output_folder, "TSS_class"))
        for tss in tss_names:
            if tss.endswith("_TSS.gff"):
                self.helper.merge_file(os.path.join(
                    self.tss_path, tss), self.all_tss)
        for fasta in os.listdir(args_pro.fastas):
            if fasta.endswith((".fa", ".fna", ".fasta")):
                self.helper.merge_file(os.path.join(
                    args_pro.fastas, fasta), self.all_fasta)
        print("generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''Remove the tmp folders and the combined files.'''
        self.helper.remove_tmp(args_pro.fastas)
        self.helper.remove_tmp(args_pro.tsss)
        self.helper.remove_tmp(args_pro.gffs)
        self.helper.remove_tmp(args_pro.wigs)
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine):
        '''Generate meme.csv from meme.txt for every result folder.

        If combine is set, the results of "allfasta" are included;
        "allfasta" is processed only once even if prefixs already
        contains it.
        '''
        strains = list(prefixs)
        if combine and ("allfasta" not in strains):
            strains.append("allfasta")
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            for folder in os.listdir(os.path.join(output_folder, strain)):
                gen_promoter_table(
                    os.path.join(output_folder, strain, folder,
                                 "meme.txt"),
                    os.path.join(output_folder, strain, folder,
                                 "meme.csv"),
                    tss_file)

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''Extract the upstream sequences of the TSSs of one genome.

        If args_pro.source is not set, the annotation, the wig folder
        and the libraries are required (the program stops with an error
        message if one of them is missing) and the output file for the
        classified TSSs in TSS_class is handed to upstream.
        '''
        if args_pro.source:
            print("generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro)
        else:
            if (args_pro.gffs is None) or (
                    args_pro.wigs is None) or (
                    args_pro.input_libs is None):
                print("Error:please assign proper annotation, tex +/- "
                      "wig folder and tex treated libs!!!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
            print("classifying TSS and extracting fasta {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_class",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro)

    def run_meme(self, args_pro):
        '''Run the whole promoter detection.'''
        # Remove the combined files of a previous run.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("generating the table...")
        self._gen_table(args_pro.output_folder, prefixs, args_pro.combine)
        self._remove_files(args_pro)
class UTRDetection(object):
    '''Detection of 5'UTRs and 3'UTRs.

    5'UTRs are computed from TSSs, annotation and transcripts; 3'UTRs
    from transcripts, annotation and (optionally) terminators.
    '''

    def __init__(self, args_utr):
        '''Set up helper objects and the input/output paths.

        The tmp paths of missing (None) input folders are set to None,
        so that run_utr_detection can report the missing input.
        '''
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.tss_path = self._tmp_path(args_utr.tsss)
        self.tran_path = self._tmp_path(args_utr.trans)
        self.utr5_path = os.path.join(args_utr.out_folder, "5UTR")
        self.utr3_path = os.path.join(args_utr.out_folder, "3UTR")
        self.utr5_stat_path = os.path.join(self.utr5_path, "statistics")
        self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")

    @staticmethod
    def _tmp_path(folder):
        '''Return the "tmp" sub-folder of folder, or None without folder.'''
        if folder is None:
            return None
        return os.path.join(folder, "tmp")

    def _check_folder(self, folder):
        '''Stop the program if a required input folder is not assigned.'''
        if folder is None:
            print("Error: lack required files!!!")
            sys.exit()

    def _check_gff(self, folder):
        '''Check the attributes of all gff files in the folder.

        An unassigned (None or empty) folder is skipped. Without this
        guard os.listdir(None) would list the current working directory
        and unrelated gff files would be checked.
        '''
        if not folder:
            return
        for gff in os.listdir(folder):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, gff))

    def _compute_utr(self, args_utr):
        '''Detect 5'UTRs and 3'UTRs for every annotation gff file.

        The gff files of the results are stored in the "gffs" folders
        of the 5UTR and 3UTR output folders; the length plots of the
        current directory are moved to the statistics folders.
        '''
        for gff in os.listdir(args_utr.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff[:-4]
            annotation = os.path.join(args_utr.gffs, gff)
            tss = self.helper.get_correct_file(
                self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            if args_utr.terms:
                term = self.helper.get_correct_file(
                    os.path.join(args_utr.terms, "tmp"),
                    "_term.gff", prefix, None, None)
            else:
                term = None
            print("computing 5'UTR of {0} .....".format(prefix))
            detect_5utr(tss, annotation, tran,
                        os.path.join(self.utr5_path, "gffs",
                                     "_".join([prefix, "5UTR.gff"])),
                        args_utr)
            print("computing 3'UTR of {0} .....".format(prefix))
            detect_3utr(tran, annotation, term,
                        os.path.join(self.utr3_path, "gffs",
                                     "_".join([prefix, "3UTR.gff"])),
                        args_utr)
            self.helper.move_all_content(
                os.getcwd(), self.utr5_stat_path, ["_5utr_length.png"])
            self.helper.move_all_content(
                os.getcwd(), self.utr3_stat_path, ["_3utr_length.png"])

    def run_utr_detection(self, args_utr):
        '''Run the whole UTR detection.

        TSSs, annotation and transcripts are required; terminators
        are optional.
        '''
        for folder in (args_utr.tsss, args_utr.gffs, args_utr.trans):
            self._check_folder(folder)
        for folder in (args_utr.tsss, args_utr.gffs,
                       args_utr.trans, args_utr.terms):
            self._check_gff(folder)
        self.multiparser.parser_gff(args_utr.gffs, None)
        self.multiparser.parser_gff(args_utr.tsss, "TSS")
        self.multiparser.combine_gff(args_utr.gffs, self.tss_path,
                                     None, "TSS")
        self.multiparser.parser_gff(args_utr.trans, "transcript")
        self.multiparser.combine_gff(args_utr.gffs, self.tran_path,
                                     None, "transcript")
        if args_utr.terms:
            self.multiparser.parser_gff(args_utr.terms, "term")
            self.multiparser.combine_gff(
                args_utr.gffs, os.path.join(args_utr.terms, "tmp"),
                None, "term")
        self._compute_utr(args_utr)
        self.helper.remove_tmp(args_utr.gffs)
        self.helper.remove_tmp(args_utr.tsss)
        self.helper.remove_tmp(args_utr.trans)
        self.helper.remove_tmp(args_utr.terms)
        self.helper.remove_tmp(self.utr5_path)
        self.helper.remove_tmp(self.utr3_path)
class TSSpredator(object): def __init__(self, args_tss): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.master = os.path.join(args_tss.out_folder, "MasterTables") self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta": "tmp_tss", "tmp": "tmp"} if args_tss.ta_files is not None: self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp") else: self.tmps["ta"] = None self.gff_path = os.path.join(args_tss.gffs, "tmp") if args_tss.manual is not None: self.manual_path = os.path.join(args_tss.manual, "tmp") self.wig_path = os.path.join(args_tss.wig_folder, "tmp") self.fasta_path = os.path.join(args_tss.fastas, "tmp") self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics") self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs") def _assign_dict(self, lib_datas): return {"wig": lib_datas[0], "tex": lib_datas[1], "condition": int(lib_datas[2]), "replicate": lib_datas[3], "strand": lib_datas[4]} def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set): for num_id in range(1, lib_num+1): cond_list = [] for lib in lib_list: if num_id == lib["condition"]: cond_list.append(lib) cond_sort_list = sorted(cond_list, key=lambda k: k['replicate']) reps = [] for cond in cond_sort_list: out.write("{0}_{1}{2} = {3}\n".format( prefix, cond["condition"], cond["replicate"], os.path.join(wig_folder, cond["wig"]))) reps.append(cond["replicate"]) for rep in sorted(rep_set): if rep not in reps: out.write("{0}_{1}{2} = \n".format( prefix, cond["condition"], rep)) def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log): print("Running TSSpredator for " + prefix) log.write("Make sure the version of TSSpredator is at least 1.06.\n") out = open(os.path.join(out_path, "log.txt"), "w") err = open(os.path.join(out_path, "err.txt"), "w") log.write(" ".join(["java", "-jar", tsspredator_path, config_file]) + "\n") call(["java", "-jar", tsspredator_path, config_file], stdout=out, stderr=err) 
out.close() err.close() log.write("Done!\n") log.write("The following files are generated in {0}:\n".format(out_path)) for file_ in os.listdir(out_path): log.write("\t" + file_ + "\n") def _import_lib(self, libs, wig_folder, project_strain_name, out, gff, program, fasta): lib_dict = {"fp": [], "fm": [], "nm": [], "np": []} lib_num = 0 rep_set = set() list_num_id = [] for lib in libs: lib_datas = lib.split(":") if not lib_datas[0].endswith(".wig"): print("Error: Wiggle files are not end with .wig!") sys.exit() for wig in os.listdir(wig_folder): filename = wig.split("_STRAIN_") if (filename[0] == lib_datas[0][:-4]) and ( filename[1][:-4] == project_strain_name): lib_datas[0] = wig if int(lib_datas[2]) > lib_num: lib_num = int(lib_datas[2]) if lib_datas[3] not in rep_set: rep_set.add(lib_datas[3]) if (lib_datas[1] == "tex") and (lib_datas[4] == "+"): lib_dict["fp"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"): lib_dict["fm"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"): lib_dict["np"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"): lib_dict["nm"].append(self._assign_dict(lib_datas)) for num_id in range(1, lib_num+1): out.write("annotation_{0} = {1}\n".format(num_id, gff)) if program.lower() == "tss": self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "fivePrimePlus", rep_set) elif program.lower() == "ps": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "fivePrimePlus", rep_set) else: print("Error: Wrong program name! 
Please assing tss " "or processing_site.") sys.exit() for num_id in range(1, lib_num+1): out.write("genome_{0} = {1}\n".format(num_id, fasta)) for num_id in range(1, lib_num+1): list_num_id.append(str(num_id)) return lib_num, num_id, rep_set, lib_dict, list_num_id def _print_repmatch(self, args_tss, out): '''check replicate match''' detect_all = False for rep in args_tss.repmatch: if "all" in rep: detect_all = True match = rep.split("_")[-1] out.write("minNumRepMatches = {0}\n".format(match)) break if not detect_all: nums = {} matchs = {} for match in args_tss.repmatch: lib = match.split("_")[0] rep = match.split("_")[-1] matchs[lib] = rep if rep not in nums.keys(): nums[rep] = 1 else: nums[rep] += 1 for rep, num in nums.items(): if num == max(nums.values()): out.write("minNumRepMatches = {0}\n".format(rep)) max_rep = rep break for lib, rep in matchs.items(): if rep != max_rep: out.write("minNumRepMatches_{0} = {1}\n".format( lib, rep)) def _gen_config(self, project_strain_name, args_tss, gff, wig_folder, fasta, config_file, log): '''generation of config files''' master_folder = "MasterTable_" + project_strain_name out_path = os.path.join(self.master, master_folder) self.helper.check_make_folder(out_path) out = open(config_file, "w") out.write("TSSinClusterSelectionMethod = HIGHEST\n") out.write("allowedCompareShift = 1\n") out.write("allowedRepCompareShift = 1\n") lib_num, num_id, rep_set, lib_dict, list_num_id = \ self._import_lib(args_tss.libs, wig_folder, project_strain_name, out, gff, args_tss.program, fasta) out.write("idList = ") out.write(",".join(list_num_id) + "\n") out.write("maxASutrLength = 100\n") out.write("maxGapLengthInGene = 500\n") out.write("maxNormalTo5primeFactor = {0}\n".format( args_tss.processing_factor)) out.write("maxTSSinClusterDistance = {0}\n".format( args_tss.cluster + 1)) out.write("maxUTRlength = {0}\n".format(args_tss.utr_length)) out.write("min5primeToNormalFactor = {0}\n".format( args_tss.enrichment_factor)) 
out.write("minCliffFactor = {0}\n".format(args_tss.factor)) out.write("minCliffFactorDiscount = {0}\n".format( args_tss.factor_reduction)) out.write("minCliffHeight = {0}\n".format(args_tss.height)) out.write("minCliffHeightDiscount = {0}\n".format( args_tss.height_reduction)) out.write("minNormalHeight = {0}\n".format(args_tss.base_height)) self._print_repmatch(args_tss, out) out.write("minPlateauLength = 0\n") out.write("mode = cond\n") out.write("normPercentile = 0.9\n") if args_tss.program.lower() == "tss": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "normalPlus", rep_set) else: self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "normalPlus", rep_set) out.write("numReplicates = {0}\n".format(len(rep_set))) out.write("numberOfDatasets = {0}\n".format(lib_num)) out.write("outputDirectory = {0}\n".format(out_path)) for prefix_id in range(len(args_tss.output_prefixs)): out.write("outputPrefix_{0} = {1}\n".format( prefix_id + 1, args_tss.output_prefixs[prefix_id])) out.write("projectName = {0}\n".format(project_strain_name)) out.write("superGraphCompatibility = igb\n") out.write("texNormPercentile = 0.5\n") out.write("writeGraphs = 0\n") out.write("writeNocornacFiles = 0\n") log.write("\t" + config_file + " is generated.\n") out.close() def _convert_gff(self, prefixs, args_tss, log): for prefix in prefixs: out_file = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program]) + ".gff") gff_f = open(out_file, "w") out_path = os.path.join(self.master, "_".join([ "MasterTable", prefix])) if "MasterTable.tsv" not in os.listdir(out_path): print("Error: There is not MasterTable file in {0} ".format( out_path)) print("Please check configuration file.") log.write("not MasterTable file is found in {0}\n".format( out_path)) else: if args_tss.program.lower() == "processing": feature = 
"processing_site" elif args_tss.program.lower() == "tss": feature = "TSS" self.converter.convert_mastertable2gff( os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic", feature, prefix, out_file) log.write("\t" + out_file + "is generated.\n") gff_f.close() def _merge_manual(self, tsss, args_tss): '''if manual detected TSS is provided, it can merge manual detected TSS and TSSpredator predicted TSS''' self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tss"])) for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break filename = "_".join([tss, args_tss.program]) + ".gff" predict = os.path.join(self.gff_outfolder, filename) manual = os.path.join(self.manual_path, tss + ".gff") fasta = os.path.join(self.fasta_path, tss + ".fa") stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss) if os.path.exists(manual): print("Merging and classiflying manually-detected " "TSSs for {0}".format(tss)) merge_manual_predict_tss( predict, stat_file, os.path.join(self.tmps["tss"], filename), os.path.join(args_tss.gffs, gff), args_tss, manual, fasta) if os.path.exists(stat_file): shutil.move(stat_file, os.path.join( args_tss.out_folder, "statistics", tss, stat_file)) self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder, [".gff"]) shutil.rmtree(self.tmps["tss"]) def _validate(self, tsss, args_tss, log): '''validate TSS with genome annotation''' print("Validating TSSs with genome annotations") log.write("Running validate_gene.py to compare genome " "annotations and TSSs/PSs.\n") for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break stat_file = os.path.join( self.stat_outfolder, tss, "".join(["stat_gene_vali_", tss, ".csv"])) out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff") if args_tss.program.lower() == "tss": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) elif args_tss.program.lower() == "processing": compare_file = 
os.path.join(self.gff_outfolder, "_".join([tss, "processing.gff"])) validate_gff(compare_file, os.path.join(args_tss.gffs, gff), stat_file, out_cds_file, args_tss.utr_length, args_tss.program.lower()) log.write("\t" + stat_file + " is generated.\n") shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff)) def _compare_ta(self, tsss, args_tss, log): '''compare TSS with transcript''' detect = False log.write("Running stat_TA_comparison to compare transcripts " "and TSSs/PSs.\n") print("Comparing transcripts and TSSs") self.multiparser.parser_gff(args_tss.ta_files, "transcript") self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None, "transcript") for tss in tsss: stat_out = os.path.join( self.stat_outfolder, tss, "".join([ "stat_compare_TSS_transcript_", tss, ".csv"])) for ta in os.listdir(self.tmps["ta"]): filename = ta.split("_transcript") if (filename[0] == tss) and (filename[1] == ".gff"): detect = True break compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) if detect: stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file, stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"], args_tss.fuzzy) self.helper.sort_gff(self.tmps["tss_ta"], compare_file) self.helper.sort_gff(self.tmps["ta_tss"], os.path.join(args_tss.ta_files, ta)) os.remove(self.tmps["tss_ta"]) os.remove(self.tmps["ta_tss"]) detect = False log.write("\t" + stat_out + " is generated.\n") def _stat_tss(self, tsss, feature, log): print("Running statistaics") for tss in tsss: compare_file = os.path.join(self.gff_outfolder, "_".join([tss, feature]) + ".gff") stat_tsspredator( compare_file, feature, os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "class", tss]) + ".csv"), os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "libs", tss]) + ".csv")) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_class", ".png"]) if os.path.exists(os.path.join( self.stat_outfolder, "TSSstatistics.tsv")): 
shutil.move( os.path.join( self.stat_outfolder, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, tss, "TSSstatistics.tsv")) plot_venn(compare_file, feature) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_venn", ".png"]) log.write("The following files in {0} are generated:\n".format( (os.path.join(self.stat_outfolder, tss)))) for file_ in os.listdir(os.path.join( self.stat_outfolder, tss)): log.write("\t" + file_ + "\n") def _set_gen_config(self, args_tss, input_folder, log): prefixs = [] detect = False log.write("Generating config files for TSSpredator.\n") for fasta in os.listdir(self.fasta_path): run = False for gff in os.listdir(self.gff_path): if fasta[:-3] == gff[:-4]: prefix = fasta[:-3] for wig in os.listdir(self.wig_path): filename = wig.split("_STRAIN_") if filename[1][:-4] == prefix: detect = True break if detect: prefixs.append(prefix) config = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._gen_config( prefix, args_tss, os.path.join(self.gff_path, gff), self.wig_path, os.path.join(self.fasta_path, fasta), config, log) return prefixs def _merge_wigs(self, wig_folder, prefix, libs): self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tmp"])) for wig_file in os.listdir(wig_folder): for lib in libs: info = lib.split(":") if (info[0][:-4] in wig_file) and (info[-1] == "+") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_forward.wig")) if (info[0][:-4] in wig_file) and (info[-1] == "-") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_reverse.wig")) def _check_orphan(self, prefixs, wig_folder, args_tss): '''if genome has no locus tag, it can use for classify the TSS''' for prefix in prefixs: self._merge_wigs(wig_folder, prefix, 
args_tss.libs) tmp_tss = os.path.join(self.tmps["tmp"], "_".join([ prefix, args_tss.program + ".gff"])) pre_tss = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program + ".gff"])) check_orphan(pre_tss, os.path.join( args_tss.gffs, prefix + ".gff"), "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss) shutil.move(tmp_tss, pre_tss) shutil.rmtree("tmp") def _remove_files(self, args_tss): print("Remove temperary files and folders") self.helper.remove_tmp_dir(args_tss.fastas) self.helper.remove_tmp_dir(args_tss.gffs) self.helper.remove_tmp_dir(args_tss.ta_files) if "merge_forward.wig" in os.listdir(os.getcwd()): os.remove("merge_forward.wig") if "merge_reverse.wig" in os.listdir(os.getcwd()): os.remove("merge_reverse.wig") shutil.rmtree(args_tss.wig_folder) if args_tss.manual is not None: shutil.rmtree(args_tss.manual) def _deal_with_overlap(self, out_folder, args_tss): '''deal with the situation that TSS and processing site at the same position''' if not args_tss.overlap_feature: pass else: print("Comparing TSSs and Processing sites") if args_tss.program.lower() == "tss": for tss in os.listdir(out_folder): if tss.endswith("_TSS.gff"): ref = self.helper.get_correct_file( args_tss.overlap_gffs, "_processing.gff", tss.replace("_TSS.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.program, args_tss.cluster) elif args_tss.program.lower() == "processing": for tss in os.listdir(out_folder): if tss.endswith("_processing.gff"): ref = self.helper.get_correct_file( args_tss.overlap_gffs, "_TSS.gff", tss.replace("_processing.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.program, args_tss.cluster) def _low_expression(self, args_tss, gff_folder): '''deal with the low expressed TSS''' prefix = None self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs) for gff in os.listdir(gff_folder): if (args_tss.program.lower() == "tss") and ( gff.endswith("_TSS.gff")): prefix = 
gff.replace("_TSS.gff", "") elif (args_tss.program.lower() == "processing") and ( gff.endswith("_processing.gff")): prefix = gff.replace("_processing.gff", "") if prefix: out = open(os.path.join( self.stat_outfolder, prefix, "_".join([ "stat", prefix, "low_expression_cutoff.csv"])), "w") out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n") cutoff = filter_low_expression( os.path.join(gff_folder, gff), args_tss, "tmp/merge_forward.wig", "tmp/merge_reverse.wig", "tmp/without_low_expression.gff") out.write("\t".join([prefix, str(cutoff)]) + "\n") os.remove(os.path.join(gff_folder, gff)) shutil.move("tmp/without_low_expression.gff", os.path.join(gff_folder, gff)) prefix = None out.close() def run_tsspredator(self, args_tss, log): input_folder = os.path.join(args_tss.out_folder, "configs") for gff in os.listdir(args_tss.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_tss.gffs, gff)) self.helper.check_make_folder(self.gff_outfolder) self.multiparser.parser_fasta(args_tss.fastas) self.multiparser.parser_gff(args_tss.gffs, None) self.multiparser.parser_wig(args_tss.wig_folder) prefixs = self._set_gen_config(args_tss, input_folder, log) for prefix in prefixs: out_path = os.path.join( self.master, "_".join(["MasterTable", prefix])) config_file = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._start_to_run(args_tss.tsspredator_path, config_file, out_path, prefix, log) if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")): shutil.move(os.path.join(out_path, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, "TSSstatistics.tsv")) if args_tss.program.lower() == "ps": args_tss.program = "processing" self._convert_gff(prefixs, args_tss, log) if args_tss.check_orphan: print("checking the orphan TSSs") log.write("Running check_orphan.py to re-check orphan TSSs.\n") self._check_orphan(prefixs, os.path.join(args_tss.wig_folder, "tmp"), args_tss) self.multiparser.combine_gff(args_tss.gffs, 
self.gff_outfolder, None, args_tss.program) datas = [] for gff in os.listdir(self.gff_outfolder): if gff.endswith(".gff"): gff_folder = gff.replace("".join(["_", args_tss.program, ".gff"]), "") self.helper.check_make_folder( os.path.join(self.stat_outfolder, gff_folder)) datas.append(gff_folder) if args_tss.remove_low_expression is not None: log.write("Running filter_low_expression.py to filter out " "low expressed TSS/PS.\n") self._low_expression(args_tss, self.gff_outfolder) if args_tss.manual is not None: self.multiparser.parser_gff(args_tss.manual, None) self.multiparser.combine_gff(args_tss.gffs, self.manual_path, None, None) self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path, None) self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None, args_tss.libs) log.write("Running merge_manual.py to merge the manual TSSs.\n") self._merge_manual(datas, args_tss) log.write("Running filter_TSS_pro.py to deal with the overlap " "position between TSS and PS.\n") self._deal_with_overlap(self.gff_outfolder, args_tss) log.write("Running stat_TSSpredator.py to do statistics.\n") self._stat_tss(datas, args_tss.program, log) if args_tss.validate: self._validate(datas, args_tss, log) if args_tss.ta_files is not None: self._compare_ta(datas, args_tss, log) self._remove_files(args_tss)
class TSSpredator(object):
    '''Drive TSSpredator and post-process its predictions.

    The class writes one TSSpredator config file per genome, runs the
    TSSpredator jar, converts the resulting MasterTable to gff and then
    optionally applies orphan checking, low-expression filtering, merging
    with manual annotations, TSS/processing-site overlap filtering,
    statistics, gene validation and comparison with transcripts.
    '''

    def __init__(self, args_tss):
        '''Create the helper objects and derive the working paths.

        args_tss is the argument container of the subcommand; only the
        attributes out_folder, ta_files, gffs, wig_folder and fastas are
        read here.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # names of temporary files/folders (relative to the working dir)
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''Turn the ":"-separated fields of one library into a dict.

        lib_datas is indexed as [wig, tex, condition, replicate, strand];
        the condition is converted to int, the rest is stored as given.
        '''
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        '''Write "<prefix>_<condition><replicate> = <wig path>" lines.

        Conditions are visited from 1 to lib_num; inside one condition the
        libraries are ordered by their replicate value.
        '''
        for num_id in range(1, lib_num + 1):
            cond_list = [lib for lib in lib_list
                         if lib["condition"] == num_id]
            for cond in sorted(cond_list, key=lambda k: k['replicate']):
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        '''Run the TSSpredator jar with config_file.

        stdout and stderr of the java process are written to log.txt and
        err.txt inside out_path.
        '''
        print("Running TSSpredator for " + prefix)
        # "with" guarantees both handles are closed even if call() raises
        with open(os.path.join(out_path, "log.txt"), "w") as out, \
                open(os.path.join(out_path, "err.txt"), "w") as err:
            call(["java", "-jar", tsspredator_path, config_file],
                 stdout=out, stderr=err)

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        '''Parse the library strings and write the library related part of
        the config file (annotation_*, fivePrime*, genome_*) to out.

        Each entry of libs has the form "wig:tex|notex:condition:replicate:
        strand". The wig name is replaced by the file in wig_folder named
        "<wig without .wig>_STRAIN_<project_strain_name>.<ext>" if there is
        one. Exits the program if a library does not end with ".wig" or if
        program is neither "tss" nor "processing_site".

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id) where
        lib_num is the highest condition number, num_id equals lib_num,
        rep_set holds the replicate names, lib_dict groups the libraries
        under "fp"/"fm" (tex, +/-) and "np"/"nm" (notex, +/-) and
        list_num_id is ["1", ..., str(lib_num)].
        '''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        # (tex type, strand) -> key of lib_dict
        groups = {("tex", "+"): "fp", ("tex", "-"): "fm",
                  ("notex", "+"): "np", ("notex", "-"): "nm"}
        lib_num = 0
        rep_set = set()
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            lib_name = lib_datas[0][:-4]
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                # files without "_STRAIN_" can never belong to a strain
                if (len(filename) > 1) and (filename[0] == lib_name) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
                    break
            lib_num = max(lib_num, int(lib_datas[2]))
            rep_set.add(lib_datas[3])
            group = groups.get((lib_datas[1], lib_datas[4]))
            if group is not None:
                lib_dict[group].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus")
        elif program.lower() == "processing_site":
            # for processing sites the notex libraries take the 5' role
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        list_num_id = [str(num_id) for num_id in range(1, lib_num + 1)]
        # the last loop value is lib_num; binding it explicitly keeps the
        # return value defined when there is no library at all
        num_id = lib_num
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss,
                    gff, wig_folder, fasta, config_file):
        '''Write the TSSpredator config file of one genome.

        The MasterTable_<project_strain_name> folder below self.master is
        passed to helper.check_make_folder and written to the config as
        outputDirectory.
        '''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        with open(config_file, "w") as out:
            out.write("TSSinClusterSelectionMethod = HIGHEST\n")
            out.write("allowedCompareShift = 1\n")
            out.write("allowedRepCompareShift = 1\n")
            lib_num, _, rep_set, lib_dict, list_num_id = \
                self._import_lib(args_tss.libs, wig_folder,
                                 project_strain_name, out, gff,
                                 args_tss.program, fasta)
            out.write("idList = ")
            out.write(",".join(list_num_id) + "\n")
            out.write("maxASutrLength = 100\n")
            out.write("maxGapLengthInGene = 500\n")
            out.write("maxNormalTo5primeFactor = {0}\n".format(
                args_tss.processing_factor))
            out.write("maxTSSinClusterDistance = {0}\n".format(
                args_tss.cluster + 1))
            out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
            out.write("min5primeToNormalFactor = {0}\n".format(
                args_tss.enrichment_factor))
            out.write("minCliffFactor = {0}\n".format(args_tss.factor))
            out.write("minCliffFactorDiscount = {0}\n".format(
                args_tss.factor_reduction))
            out.write("minCliffHeight = {0}\n".format(args_tss.height))
            out.write("minCliffHeightDiscount = {0}\n".format(
                args_tss.height_reduction))
            out.write("minNormalHeight = {0}\n".format(
                args_tss.base_height))
            out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
            out.write("minPlateauLength = 0\n")
            out.write("mode = cond\n")
            out.write("normPercentile = 0.9\n")
            # the "normal" libraries are the counterpart of the 5' ones
            # chosen in _import_lib
            if args_tss.program.lower() == "tss":
                minus_libs, plus_libs = lib_dict["nm"], lib_dict["np"]
            else:
                minus_libs, plus_libs = lib_dict["fm"], lib_dict["fp"]
            self._print_lib(lib_num, minus_libs, out,
                            wig_folder, "normalMinus")
            self._print_lib(lib_num, plus_libs, out,
                            wig_folder, "normalPlus")
            out.write("numReplicates = {0}\n".format(len(rep_set)))
            out.write("numberOfDatasets = {0}\n".format(lib_num))
            out.write("outputDirectory = {0}\n".format(out_path))
            for prefix_id, out_prefix in enumerate(
                    args_tss.output_prefixs, 1):
                out.write("outputPrefix_{0} = {1}\n".format(
                    prefix_id, out_prefix))
            out.write("projectName = {0}\n".format(project_strain_name))
            out.write("superGraphCompatibility = igb\n")
            out.write("texNormPercentile = 0.5\n")
            out.write("writeGraphs = 0\n")
            out.write("writeNocornacFiles = 0\n")

    def _convert_gff(self, prefixs, args_tss):
        '''Convert the MasterTable.tsv of every prefix to a gff file
        "<prefix>_<program>.gff" in self.gff_outfolder.

        The output file is created (or emptied) even if no MasterTable.tsv
        exists; in that case only an error message is printed.
        '''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            # create/truncate the output file without leaking the handle
            with open(out_file, "w"):
                pass
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)

    def _merge_manual(self, tsss, args_tss):
        '''Merge the predictions of every genome in tsss via
        merge_manual_predict_tss and move the merged gff files and the
        statistics file to their output folders.
        '''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tss"]))
        for tss in tsss:
            # NOTE(review): if no annotation matches, "gff" keeps the last
            # listed file name (or is undefined for an empty folder)
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''Run validate_gff for every genome in tsss and replace the
        annotation gff by the file validate_gff wrote to
        <out_folder>/tmp.gff.
        '''
        print("Running validation of annotation....")
        for tss in tsss:
            # NOTE(review): if no annotation matches, "gff" keeps the last
            # listed file name (or is undefined for an empty folder)
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''Compare the TSS gff of every genome in tsss with the matching
        "<genome>_transcript.gff" via stat_ta_tss and replace both gff
        files by the sorted comparison results.
        '''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_Transcriptome_assembly_",
                    tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        '''Run stat_tsspredator and plot_venn for every genome in tsss and
        collect the generated files in <stat_outfolder>/<genome>.
        '''
        print("Running statistaics.....")
        for tss in tsss:
            stat_folder = os.path.join(self.stat_outfolder, tss)
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_class", ".png"])
            shared_stat = os.path.join(self.stat_outfolder,
                                       "TSSstatistics.tsv")
            if os.path.exists(shared_stat):
                shutil.move(shared_stat,
                            os.path.join(stat_folder, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_venn", ".png"])

    def _has_wig(self, prefix):
        '''Return True if self.wig_path holds a file whose name part after
        "_STRAIN_" (without the last four characters) equals prefix.'''
        for wig in os.listdir(self.wig_path):
            filename = wig.split("_STRAIN_")
            # names without "_STRAIN_" are not per-strain wig files
            if (len(filename) > 1) and (filename[1][:-4] == prefix):
                return True
        return False

    def _set_gen_config(self, args_tss, input_folder):
        '''Generate a config file for every genome that has a fasta file,
        a gff file and at least one wig file.

        Returns the list of prefixes a config file was written for.
        '''
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] != gff[:-4]:
                    continue
                prefix = fasta[:-3]
                # decided per genome: a wig found for an earlier genome
                # must not enable the following ones
                if not self._has_wig(prefix):
                    continue
                prefixs.append(prefix)
                config = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
                self._gen_config(
                    prefix, args_tss,
                    os.path.join(self.gff_path, gff), self.wig_path,
                    os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Merge the wig files of wig_folder which belong to libs and whose
        name contains prefix into tmp/merge_forward.wig ("+" libraries)
        and tmp/merge_reverse.wig ("-" libraries).
        '''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tmp"]))
        targets = {
            "+": os.path.join(self.tmps["tmp"], "merge_forward.wig"),
            "-": os.path.join(self.tmps["tmp"], "merge_reverse.wig")}
        for wig_file in os.listdir(wig_folder):
            if prefix not in wig_file:
                continue
            wig_path = os.path.join(wig_folder, wig_file)
            if not os.path.isfile(wig_path):
                continue
            for lib in libs:
                info = lib.split(":")
                # the strand is the last field of the library string
                if (info[0][:-4] in wig_file) and (info[-1] in targets):
                    self.helper.merge_file(wig_path, targets[info[-1]])

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Run check_orphan on the predicted gff of every prefix and
        replace the prediction by its output.'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        # the folder only exists if at least one prefix was processed
        if os.path.isdir("tmp"):
            shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''Remove the temporary folders of the inputs and merged wig files
        left in the working directory.'''
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''Run filter_tss_pro on the predictions of out_folder against the
        reference files of the other feature type (processing sites for a
        TSS run and vice versa). Nothing is done if overlap_feature is
        "both".
        '''
        if args_tss.overlap_feature.lower() == "both":
            return
        print("Comparing TSS and Processing site...")
        program = args_tss.program.lower()
        if program == "tss":
            own_suffix, ref_suffix = "_TSS.gff", "_processing.gff"
        elif program in ("processing", "processing_site"):
            # run_tsspredator renames "processing_site" to "processing"
            # before this method is called, so both names are accepted
            own_suffix, ref_suffix = "_processing.gff", "_TSS.gff"
        else:
            return
        for tss in os.listdir(out_folder):
            if tss.endswith(own_suffix):
                ref = self.helper.get_correct_file(
                    args_tss.references, ref_suffix,
                    tss.replace(own_suffix, ""), None, None)
                filter_tss_pro(os.path.join(out_folder, tss),
                               ref, args_tss.overlap_feature,
                               args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''Run filter_low_expression on every prediction in gff_folder,
        replace the prediction by the filtered gff and write the returned
        cutoff to stat_<genome>_low_expression_cutoff.csv.
        '''
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        program = args_tss.program.lower()
        for gff in os.listdir(gff_folder):
            prefix = None
            if (program == "tss") and gff.endswith("_TSS.gff"):
                prefix = gff.replace("_TSS.gff", "")
            elif (program == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if not prefix:
                continue
            cutoff_file = os.path.join(
                self.stat_outfolder, prefix, "_".join([
                    "stat", prefix, "low_expression_cutoff.csv"]))
            with open(cutoff_file, "w") as out:
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
            os.remove(os.path.join(gff_folder, gff))
            shutil.move("tmp/without_low_expression.gff",
                        os.path.join(gff_folder, gff))

    def run_tsspredator(self, args_tss):
        '''Entry point: generate the configs, run TSSpredator for every
        genome and apply the post-processing steps selected in args_tss.

        Note that args_tss.program is changed from "processing_site" to
        "processing" once TSSpredator has been run.
        '''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(self.stat_outfolder,
                                         "TSSstatistics.tsv"))
        # from here on the short name is used for file names
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class OperonDetection(object):
    '''detection of operon'''

    def __init__(self, args_op):
        '''Create the helper objects and derive the working paths.

        TSS and terminator inputs are optional: their paths are None when
        args_op.tsss / args_op.terms is None. Terminator gff files are
        passed to _check_gff right away.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_op.tsss is not None:
            self.tss_path = os.path.join(args_op.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")
        else:
            self.term_path = None

    def _check_gff(self, gffs, type_):
        '''Call helper.check_uni_attributes on every ".gff" file of gffs.

        type_ is accepted for the callers' sake but not used.
        '''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _detect_operon(self, prefixs, args_op, log):
        '''Run operon() for every prefix and record the generated gff and
        table paths in log.

        False is passed to operon() in place of the TSS / terminator file
        when the corresponding input was not provided.
        '''
        log.write("Running detect_operon.py to detect operon.\n")
        log.write("The the following files are generated:\n")
        for prefix in prefixs:
            out_gff = os.path.join(args_op.output_folder, "gffs",
                                   "_".join([prefix, "operon.gff"]))
            out_table = os.path.join(self.table_path,
                                     "_".join([prefix, "operon.csv"]))
            print("Detecting operons of {0}".format(prefix))
            if self.tss_path is None:
                tss = False
            else:
                tss = self.helper.get_correct_file(
                    self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            gff = self.helper.get_correct_file(
                args_op.gffs, ".gff", prefix, None, None)
            if self.term_path is None:
                term = False
            else:
                term = self.helper.get_correct_file(
                    self.term_path, "_term.gff", prefix, None, None)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table, out_gff)
            log.write("\t" + out_table + "\n")
            log.write("\t" + out_gff + "\n")

    def _check_and_parser_gff(self, args_op):
        '''Check all input gff files and run multiparser.parser_gff /
        combine_gff on the annotation, transcript and (if given) TSS and
        terminator folders.'''
        self._check_gff(args_op.gffs, "gff")
        self._check_gff(args_op.trans, "tran")
        self.multiparser.parser_gff(args_op.gffs, None)
        self.multiparser.parser_gff(args_op.trans, "transcript")
        self.multiparser.combine_gff(args_op.gffs, self.tran_path,
                                     None, "transcript")
        if args_op.tsss is not None:
            self._check_gff(args_op.tsss, "tss")
            self.multiparser.parser_gff(args_op.tsss, "TSS")
            self.multiparser.combine_gff(args_op.gffs, self.tss_path,
                                         None, "TSS")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.multiparser.parser_gff(args_op.terms, "term")
            self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                         None, "term")

    def _stat(self, table_path, stat_folder, log):
        '''Run stat() on every "*_operon.csv" table of table_path; the
        result is written to "stat_<table name>" in stat_folder and its
        path is recorded in log.'''
        log.write("Running stat_operon.py to do statistics.\n")
        for table in os.listdir(table_path):
            if table.endswith("_operon.csv"):
                filename = "_".join(["stat", table])
                out_stat = os.path.join(stat_folder, filename)
                stat(os.path.join(table_path, table), out_stat)
                log.write("\t" + out_stat + "\n")

    def run_operon(self, args_op, log):
        '''Entry point: prepare the inputs, detect the operons of every
        annotation gff, compute the statistics and remove the temporary
        folders of the inputs.'''
        self._check_and_parser_gff(args_op)
        prefixs = [gff.replace(".gff", "")
                   for gff in os.listdir(args_op.gffs)
                   if gff.endswith(".gff")]
        self._detect_operon(prefixs, args_op, log)
        self._stat(self.table_path, args_op.stat_folder, log)
        self.helper.remove_tmp_dir(args_op.gffs)
        # TSS input is optional, like the terminator input below
        if args_op.tsss is not None:
            self.helper.remove_tmp_dir(args_op.tsss)
        self.helper.remove_tmp_dir(args_op.trans)
        if args_op.terms is not None:
            self.helper.remove_tmp_dir(args_op.terms)
class TranscriptAssembly(object):
    '''Transcript assembly workflow.

    Computes transcripts from fragmented and/or TEX+/- wiggle files,
    merges both results, optionally adjusts them against the genome
    annotation and compares them with TSS, CDS and terminator files.
    '''

    def __init__(self, args_tran):
        '''Derive output folders and temporary file names from
        ``args_tran.out_folder``.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder,
                                            "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder,
                                            "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder,
                                            "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder,
                                            "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(self.gff_outfolder,
                                             "tmp_overlap")}
        self.frag = "transcript_assembly_fragment.gff"
        self.tex = "transcript_assembly_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type,
                            strain, libs, args_tran):
        '''Run ``assembly`` for one strain; the result is written to
        ``<out_folder>/<strain>_<wig_type>``.'''
        print("Computing transcript assembly for {0}...".format(strain))
        out = os.path.join(args_tran.out_folder,
                           "_".join([strain, wig_type]))
        assembly(wig_f, wig_r, wig_folder, libs, out, wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Run the assembly for every strain that has a
        ``*_forward.wig`` in ``<wigs>/tmp`` and return the strain names.'''
        wig_folder = os.path.join(wigs, "tmp")
        strains = [wig.replace("_forward.wig", "")
                   for wig in os.listdir(wig_folder)
                   if wig.endswith("_forward.wig")]
        for strain in strains:
            f_file = os.path.join(wig_folder,
                                  "_".join([strain, "forward.wig"]))
            r_file = os.path.join(wig_folder,
                                  "_".join([strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran):
        '''Compare each transcript file with its TSS file.

        ``stat_ta_tss`` writes updated copies of both inputs to temporary
        files; these are sorted back over the transcript file and into
        the TSS folder, and the temporary files are removed.
        '''
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(self.gff_outfolder, tss_folder,
                                     "transcript", "TSS")
        print("Comaring of Transcript assembly and TSS file...")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(self.stat_path, "".join([
                "stat_compare_Transcriptome_assembly_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                if (tss.split("_TSS")[0] != ta) or (
                        not tss.endswith(".gff")):
                    continue
                tss_file = os.path.join(tss_folder, tss)
                stat_ta_tss(ta_file, tss_file, stat_tss_out,
                            self.tmps["ta_tss"], self.tmps["tss_ta"],
                            args_tran.fuzzy)
                os.remove(ta_file)
                os.remove(tss_file)
                self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                self.helper.sort_gff(
                    self.tmps["tss_ta"],
                    os.path.join(args_tran.compare_tss, tss))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])

    def _compare_cds(self, tas, args_tran):
        '''Compare each transcript file with its annotation file.

        Works like ``_compare_tss`` but calls ``stat_ta_gff`` and writes
        the updated annotation back into ``args_tran.compare_cds``.
        '''
        cds_folder = os.path.join(args_tran.compare_cds, "tmp")
        self.multiparser.parser_gff(args_tran.compare_cds, None)
        self.multiparser.combine_gff(self.gff_outfolder, cds_folder,
                                     "transcript", None)
        print("Comaring of Transcript assembly and gene...")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(self.stat_path, "".join([
                "stat_compare_Transcriptome_assembly_gene_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] != ta) or (not gff.endswith(".gff")):
                    continue
                cds_file = os.path.join(cds_folder, gff)
                updated_cds = os.path.join(args_tran.compare_cds, gff)
                stat_ta_gff(ta_file, cds_file, stat_gff_out,
                            self.tmps["ta_gff"], self.tmps["gff_ta"],
                            args_tran.c_feature)
                os.remove(ta_file)
                os.remove(updated_cds)
                self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                self.helper.sort_gff(self.tmps["gff_ta"], updated_cds)
                os.remove(self.tmps["ta_gff"])
                os.remove(self.tmps["gff_ta"])

    def _compare_tss_cds(self, tas, args_tran):
        '''Run the CDS and/or TSS comparison, whichever folders were
        given; does nothing when neither is set.'''
        with_cds = args_tran.compare_cds is not None
        with_tss = args_tran.compare_tss is not None
        if not (with_cds or with_tss):
            return
        self.multiparser.parser_gff(self.gff_outfolder, "transcript")
        if with_cds:
            self._compare_cds(tas, args_tran)
        if with_tss:
            self._compare_tss(tas, args_tran)

    def _for_one_wig(self, type_, args_tran):
        '''Assemble transcripts for one library type ("tex_notex" or
        anything else, treated as fragmented) and move the sorted result
        into the gff output folder. Returns the strain names.'''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Computing {0} wig files....".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            raw = os.path.join(args_tran.out_folder,
                               "_".join([strain, type_]))
            out = os.path.join(self.gff_outfolder, "_".join([
                strain, "transcript_assembly", type_ + ".gff"]))
            self.helper.sort_gff(raw, out)
            os.remove(raw)
        return strains

    def _for_two_wigs(self, strains, args_tran):
        '''Produce ``<strain>_transcript.gff`` for every strain.

        With both library types the two assemblies are merged by
        ``combine`` and the inputs removed; with only one type its file
        is simply renamed.
        '''
        has_frag = args_tran.frag_wigs is not None
        has_tex = args_tran.tex_wigs is not None
        if has_frag and has_tex:
            print("merge fragment and tex treat one ....")
        for strain in strains:
            frag_gff = os.path.join(self.gff_outfolder,
                                    "_".join([strain, self.frag]))
            tex_gff = os.path.join(self.gff_outfolder,
                                   "_".join([strain, self.tex]))
            final_gff = os.path.join(self.gff_outfolder,
                                     "_".join([strain, self.endfix_tran]))
            if has_frag and has_tex:
                # The per-strain file names are fully determined, so they
                # are used directly instead of being rediscovered by a
                # directory scan (which could leave a name unbound or
                # carry one over from the previous strain).
                combine(frag_gff, tex_gff, args_tran.tolerance, final_gff)
                os.remove(frag_gff)
                os.remove(tex_gff)
            elif has_frag:
                shutil.move(frag_gff, final_gff)
            elif has_tex:
                shutil.move(tex_gff, final_gff)

    def _post_modify(self, tas, args_tran):
        '''Adjust every transcript file against its annotation file.

        ``fill_gap`` is run in "overlap" and "uni" mode, both results are
        merged and sorted, ``longer_ta`` is applied with
        ``args_tran.length`` and the final file is collected in
        ``self.tmps["tran"]``. Afterwards the gff output folder is
        replaced by that folder.
        '''
        for ta in tas:
            # NOTE(review): when no annotation file matches ``ta`` the
            # last listed entry is used; callers are expected to pass
            # only prefixes that exist in args_tran.gffs.
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} refering to {1}...".format(ta, gff))
            anno_file = os.path.join(args_tran.gffs, gff)
            tran_file = os.path.join(self.tran_path,
                                     "_".join([ta, self.endfix_tran]))
            fill_gap(anno_file, tran_file, "overlap", self.tmps["overlap"])
            fill_gap(anno_file, tran_file, "uni", self.tmps["uni"])
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # Drop a left-over merge file before merging into it. The
            # check has to look at the file system, not at the folder
            # name string.
            if os.path.exists(tmp_merge):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder,
                                   "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out, os.path.join(
                self.tmps["tran"], "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Remove the temporary wiggle data and tmp folders of every
        input folder that was given, and of the gff output folder.'''
        for wigs in (args_tran.frag_wigs, args_tran.tex_wigs):
            if wigs is not None:
                self.helper.remove_wigs(wigs)
        for folder in (args_tran.gffs, args_tran.compare_cds,
                       args_tran.compare_tss, args_tran.terms):
            if folder is not None:
                self.helper.remove_tmp(folder)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran):
        '''Compare transcripts with terminators when a terminator folder
        was given; ``args_tran.fuzzy_term`` is passed for both fuzzy
        arguments of ``compare_term_tran``.'''
        if args_tran.terms is None:
            return
        term_tmp = os.path.join(args_tran.terms, "tmp")
        print("comparing between terminators and transcripts...")
        self.multiparser.parser_gff(args_tran.terms, "term")
        self.multiparser.combine_gff(args_tran.gffs, term_tmp,
                                     None, "term")
        compare_term_tran(self.gff_outfolder, term_tmp,
                          args_tran.fuzzy_term, args_tran.fuzzy_term,
                          args_tran.out_folder, "transcript")

    def run_transcript_assembly(self, args_tran):
        '''Entry point of the transcript assembly.

        Exits when neither fragmented nor TEX+/- wiggle folders are
        given. The annotation based post-processing only runs when
        ``args_tran.gffs`` is set.
        '''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            print("Error: there is no wigs files!!!!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran)
        tas = []
        if args_tran.gffs is not None:
            # Sort every annotation file in place via a temporary file.
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    gff_file = os.path.join(args_tran.gffs, gff)
                    self.helper.sort_gff(gff_file, self.tmps["gff"])
                    shutil.move(self.tmps["gff"], gff_file)
            self.multiparser.combine_gff(
                args_tran.gffs, os.path.join(args_tran.gffs, "tmp"),
                None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            # Only non-empty transcript files are post-processed.
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff") and (os.path.getsize(
                        os.path.join(self.tran_path, ta)) != 0):
                    tas.append(ta.replace("_" + self.endfix_tran, ""))
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran)
        self._compare_term_tran(args_tran)
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._remove_file(args_tran)
class sORFDetection(object):
    '''detection of sORF

    Extracts intergenic/UTR regions from transcripts and annotations,
    searches them for sORFs and writes gff files, tables and statistics
    below ``args_sorf.out_folder``.
    '''

    def __init__(self, args_sorf):
        '''Derive the working paths from the parsed arguments.

        TSS, sRNA and transcript folders may be None here; a missing
        transcript folder is reported later by
        ``_check_necessary_files`` instead of failing in the constructor.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        if args_sorf.trans is not None:
            self.tran_path = os.path.join(args_sorf.trans, "tmp")
        else:
            self.tran_path = None
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        '''Pass every ``.gff`` file of folder ``gffs`` to
        ``helper.check_uni_attributes``.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        '''Validate the required inputs and split the gff folders.

        Exits (``sys.exit``) when annotation, transcript or both wiggle
        folders are missing, or when UTR-derived detection is requested
        without TSS files.
        '''
        no_wigs = (args_sorf.tex_wigs is None) and (
            args_sorf.frag_wigs is None)
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or no_wigs:
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect and (args_sorf.tsss is None):
            print("Error: TSS files are required for UTR derived"
                  " sORF detection!")
            log.write("TSS files are required for UTR derived"
                      " sORF detection!\n")
            sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon
        and ribosome binding site

        ``sorf_detection`` is run per prefix; when it produced
        ``<prefix>_sORF_all.gff`` the four result files are renamed and
        distributed over the all/best gff and table folders.
        '''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        all_gff_dir = os.path.join(self.gff_output, self.all_cand)
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            srna_file = None
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            tss_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            sorf_detection(
                os.path.join(self.fasta_path, prefix + ".fa"), srna_file,
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])),
                tss_file,
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "forward.wig"])),
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "reverse.wig"])),
                os.path.join(all_gff_dir, "_".join([prefix, "sORF"])),
                args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) not in os.listdir(
                    all_gff_dir):
                continue
            gff_all = os.path.join(all_gff_dir,
                                   "_".join([prefix, "sORF.gff"]))
            gff_best = os.path.join(self.gff_output, self.best,
                                    "_".join([prefix, "sORF.gff"]))
            csv_all = os.path.join(self.table_output, self.all_cand,
                                   "_".join([prefix, "sORF.csv"]))
            csv_best = os.path.join(self.table_output, self.best,
                                    "_".join([prefix, "sORF.csv"]))
            # All raw results are written next to the "all" gff files
            # and moved to their final places from there.
            moves = (("sORF_all.gff", gff_all),
                     ("sORF_best.gff", gff_best),
                     ("sORF_all.csv", csv_all),
                     ("sORF_best.csv", csv_best))
            for raw_suffix, target in moves:
                shutil.move(
                    os.path.join(all_gff_dir,
                                 "_".join([prefix, raw_suffix])),
                    target)
            for target in (gff_all, gff_best, csv_all, csv_best):
                log.write("\t" + target + "\n")

    def _remove_tmp(self, args_sorf):
        '''Remove temporary gff files, tmp folders of the inputs and the
        ``temp_wig`` / ``merge_wigs`` folders of the output folder.'''
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        # TSS and sRNA folders are optional; skip them when not given.
        if args_sorf.tsss is not None:
            self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        if args_sorf.srnas is not None:
            self.helper.remove_tmp_dir(args_sorf.srnas)
        for folder in ("temp_wig", "merge_wigs"):
            if folder in os.listdir(args_sorf.out_folder):
                shutil.rmtree(os.path.join(args_sorf.out_folder, folder))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region

        Runs ``get_intergenic`` for every annotation file and returns
        the list of processed prefixes.
        '''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            inter_file = os.path.join(args_sorf.out_folder,
                                      "_".join([prefix, "inter.gff"]))
            print("Comparing transcripts and CDSs of {0}".format(prefix))
            get_intergenic(
                os.path.join(args_sorf.gffs, gff),
                os.path.join(self.tran_path,
                             "_".join([prefix, "transcript.gff"])),
                inter_file, args_sorf.utr_detect, args_sorf.hypo,
                args_sorf.extend_5, args_sorf.extend_3)
            log.write("\t" + inter_file +
                      " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        '''Run ``reorganize_table`` on the all/best candidate table of
        every prefix.'''
        log.write("Running re_table.py for generating coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(
                    args_sorf.out_folder, "tables", type_,
                    "_".join([prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        '''Entry point of the sORF detection.

        Exits when ``args_sorf.fuzzy_rbs`` is larger than 6 or required
        inputs are missing.
        '''
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        all_gff_dir = os.path.join(self.gff_output, self.all_cand)
        for sorf in os.listdir(all_gff_dir):
            if not sorf.endswith("_sORF.gff"):
                continue
            # Announce only the files that are really processed.
            print("Running statistics of {0}".format(sorf))
            stat_file = os.path.join(
                args_sorf.out_folder, "statistics",
                "_".join(["stat", sorf.replace(".gff", ".csv")]))
            stat(os.path.join(all_gff_dir, sorf),
                 os.path.join(self.gff_output, self.best, sorf),
                 stat_file, args_sorf.utr_detect)
            log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)
class TestMultiparser(unittest.TestCase):
    '''Tests for the parser_* and combine_* methods of Multiparser.

    Every test works inside two scratch folders ("ref_folder" and
    "tar_folder") in the current directory, created in setUp and removed
    in tearDown.
    '''

    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        self.tar_folder = "tar_folder"
        for folder in (self.ref_folder, self.tar_folder):
            if not os.path.exists(folder):
                os.mkdir(folder)

    def tearDown(self):
        for folder in (self.ref_folder, self.tar_folder):
            if os.path.exists(folder):
                shutil.rmtree(folder)

    def _write(self, path, content):
        # Create (or overwrite) a fixture file with the given text.
        with open(path, "w") as handle:
            handle.write(content)

    def _make_tmp_folders(self, ref_subfolder):
        # Create tar_folder/tmp and ref_folder/<ref_subfolder>;
        # returns (tmp_tar, tmp_ref).
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, ref_subfolder)
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        return tmp_tar, tmp_ref

    def _assert_exists(self, folder, names):
        # Assert, in order, that every name exists below folder.
        for name in names:
            self.assertTrue(os.path.exists(os.path.join(folder, name)))

    def test_combine_fasta(self):
        tmp_tar, tmp_ref = self._make_tmp_folders("test.gff_folder")
        self._write(os.path.join(tmp_tar, "aaa.fa"),
                    self.example.sub_fasta1)
        self._write(os.path.join(tmp_tar, "bbb.fa"),
                    self.example.sub_fasta2)
        self._write(os.path.join(tmp_ref, "aaa.gff"),
                    self.example.sub_gff1)
        self._write(os.path.join(tmp_ref, "bbb.gff"),
                    self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self._assert_exists(tmp_tar, ["test.fa"])

    def test_combine_wig(self):
        tmp_tar, tmp_ref = self._make_tmp_folders("test.fa_folder")
        self._write(os.path.join(tmp_ref, "aaa.fa"),
                    self.example.sub_fasta1)
        self._write(os.path.join(tmp_ref, "bbb.fa"),
                    self.example.sub_fasta2)
        wig_contents = [
            ("test_forward.wig_STRAIN_aaa.wig", self.example.sub_f_wig1),
            ("test_forward.wig_STRAIN_bbb.wig", self.example.sub_f_wig2),
            ("test_reverse.wig_STRAIN_aaa.wig", self.example.sub_r_wig1),
            ("test_reverse.wig_STRAIN_bbb.wig", self.example.sub_r_wig2)]
        for wig_name, wig_text in wig_contents:
            self._write(os.path.join(tmp_tar, wig_name), wig_text)
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
                "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar,
                                     "fasta", libs)
        self._assert_exists(tmp_tar,
                            ["test_forward.wig", "test_reverse.wig"])

    def test_combine_gff(self):
        tmp_tar, tmp_ref = self._make_tmp_folders("test.fa_folder")
        self._write(os.path.join(tmp_ref, "aaa.fa"),
                    self.example.sub_fasta1)
        self._write(os.path.join(tmp_ref, "bbb.fa"),
                    self.example.sub_fasta2)
        self._write(os.path.join(tmp_tar, "aaa.gff"),
                    self.example.sub_gff1)
        self._write(os.path.join(tmp_tar, "bbb.gff"),
                    self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar,
                                     "fasta", None)
        self._assert_exists(tmp_tar, ["test.gff"])

    def test_parser_fasta(self):
        self._write(os.path.join(self.ref_folder, "test.fa"),
                    self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self._assert_exists(self.ref_folder, [
            "tmp/aaa.fa", "tmp/bbb.fa",
            "test.fa_folder/aaa.fa", "test.fa_folder/bbb.fa"])

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        self._write(gff_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self._assert_exists(self.ref_folder, [
            "tmp/aaa.gff", "tmp/bbb.gff",
            "test.gff_folder/aaa.gff", "test.gff_folder/bbb.gff"])
        # Second round: the same content as a TSS file.
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        self._write(tss_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self._assert_exists(self.ref_folder, [
            "tmp/aaa_TSS.gff", "tmp/bbb_TSS.gff",
            "test_TSS.gff_folder/aaa_TSS.gff",
            "test_TSS.gff_folder/bbb_TSS.gff"])

    def test_parser_wig(self):
        self._write(os.path.join(self.ref_folder, "test_forward.wig"),
                    self.example.wig_f_file)
        self._write(os.path.join(self.ref_folder, "test_reverse.wig"),
                    self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self._assert_exists(self.ref_folder, [
            "tmp/test_forward_STRAIN_aaa.wig",
            "tmp/test_forward_STRAIN_bbb.wig",
            "tmp/test_reverse_STRAIN_aaa.wig",
            "tmp/test_reverse_STRAIN_bbb.wig",
            "test_forward.wig_folder/test_forward_STRAIN_aaa.wig",
            "test_forward.wig_folder/test_forward_STRAIN_bbb.wig",
            "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig",
            "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig"])
class UTRDetection(object):
    '''detection of UTR

    Computes 5'UTRs (from TSSs, annotation and transcripts) and 3'UTRs
    (from transcripts, annotation and optional terminators) for every
    genome annotation file.
    '''

    def __init__(self, args_utr):
        '''Derive the working paths from the parsed arguments.

        A missing TSS or transcript folder (None) is tolerated here so
        that ``run_utr_detection`` can report it with its own error
        message instead of failing in the constructor.
        '''
        self.helper = Helper()
        self.multiparser = Multiparser()
        if args_utr.tsss is not None:
            self.tss_path = os.path.join(args_utr.tsss, "tmp")
        else:
            self.tss_path = None
        if args_utr.trans is not None:
            self.tran_path = os.path.join(args_utr.trans, "tmp")
        else:
            self.tran_path = None
        self.utr5_path = os.path.join(args_utr.out_folder, "5UTR")
        self.utr3_path = os.path.join(args_utr.out_folder, "3UTR")
        self.utr5_stat_path = os.path.join(self.utr5_path, "statistics")
        self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")

    def _check_folder(self, folder):
        '''Exit with an error message when a required folder is None.'''
        if folder is None:
            print("Error: Lack required files!!!")
            sys.exit()

    def _check_gff(self, folder):
        '''Pass every ``.gff`` file of ``folder`` to
        ``helper.check_uni_attributes``.'''
        for gff in os.listdir(folder):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, gff))

    def _compute_utr(self, args_utr):
        '''Run ``detect_5utr`` and ``detect_3utr`` for every annotation
        file and move the length plots from the current directory into
        the statistics folders.'''
        for gff in os.listdir(args_utr.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff[:-4]
            gff_file = os.path.join(args_utr.gffs, gff)
            tss = self.helper.get_correct_file(
                self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            if args_utr.terms:
                term = self.helper.get_correct_file(
                    os.path.join(args_utr.terms, "tmp"), "_term.gff",
                    prefix, None, None)
            else:
                term = None
            print("Computing 5'UTR of {0}".format(prefix))
            detect_5utr(tss, gff_file, tran,
                        os.path.join(self.utr5_path, "gffs",
                                     "_".join([prefix, "5UTR.gff"])),
                        args_utr)
            print("Computing 3'UTR of {0}".format(prefix))
            detect_3utr(tran, gff_file, term,
                        os.path.join(self.utr3_path, "gffs",
                                     "_".join([prefix, "3UTR.gff"])),
                        args_utr)
            self.helper.move_all_content(
                os.getcwd(), self.utr5_stat_path, ["_5utr_length.png"])
            self.helper.move_all_content(
                os.getcwd(), self.utr3_stat_path, ["_3utr_length.png"])

    def run_utr_detection(self, args_utr):
        '''Entry point of the UTR detection.

        TSS, annotation and transcript folders are required (the run
        exits when one is None); the terminator folder is optional and
        is only checked, parsed and cleaned up when it is given.
        '''
        for required in (args_utr.tsss, args_utr.gffs, args_utr.trans):
            self._check_folder(required)
        for folder in (args_utr.tsss, args_utr.gffs, args_utr.trans):
            self._check_gff(folder)
        # os.listdir(None) would list the current directory, so the
        # optional terminator folder must not be checked when missing.
        if args_utr.terms:
            self._check_gff(args_utr.terms)
        self.multiparser.parser_gff(args_utr.gffs, None)
        self.multiparser.parser_gff(args_utr.tsss, "TSS")
        self.multiparser.combine_gff(args_utr.gffs, self.tss_path,
                                     None, "TSS")
        self.multiparser.parser_gff(args_utr.trans, "transcript")
        self.multiparser.combine_gff(args_utr.gffs, self.tran_path,
                                     None, "transcript")
        if args_utr.terms:
            self.multiparser.parser_gff(args_utr.terms, "term")
            self.multiparser.combine_gff(
                args_utr.gffs, os.path.join(args_utr.terms, "tmp"),
                None, "term")
        self._compute_utr(args_utr)
        self.helper.remove_tmp_dir(args_utr.gffs)
        self.helper.remove_tmp_dir(args_utr.tsss)
        self.helper.remove_tmp_dir(args_utr.trans)
        if args_utr.terms:
            self.helper.remove_tmp_dir(args_utr.terms)
        self.helper.remove_tmp(self.utr5_path)
        self.helper.remove_tmp(self.utr3_path)
class Terminator(object): '''detection of terminator''' def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables")} self.terms = {"all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "expressed_candidates"), "best": os.path.join(self.outfolder["term"], "best_candidates"), "non": os.path.join(self.outfolder["term"], "non_expressed_candidates")} self.csvs = {"all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "expressed_candidates"), "best": os.path.join(self.outfolder["csv"], "best_candidates"), "non": os.path.join(self.outfolder["csv"], "non_expressed_candidates")} self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp")} self.suffixs = {"gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff"} if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: if (file_.endswith(".ptt")) and (os.stat(file_).st_size == 0): print("Warning: No CDS information, " "TransTermHP can not work!") return "NO_CDS" if os.path.exists(file_) and ( os.stat(file_).st_size != 0): check_start = False fh = 
open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() return "Normal" def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file( fasta_path, ".fa", prefix, None, None) if not fasta: log.write("{0}.fa can not be found.\n".format(prefix)) print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): log.write("Running converter.py to convert {0} and " "{1} to {2}, {3}, and {4}.\n".format( gff_file, srna, ptt_file, rnt_file, srna.replace(".gff", ".rnt"))) self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n\t{2}\n".format( ptt_file, rnt_file, srna.replace(".gff", ".rnt"))) if (not srna) and (fasta): log.write("Running converter.py to convert {0} " "to {1}, and {2}.\n".format( gff_file, ptt_file, rnt_file)) self.converter.convert_gff2rntptt( gff_file, fasta, 
ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
                else:
                    log.write("Running converter.py to convert {0} "
                              "to {1}, and {2}.\n".format(
                                  gff_file, ptt_file, rnt_file))
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
                    log.write("The following files are generated:\n")
                    log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        '''combine the ptt/rnt files (plus the sRNA rnt file for "srna"
        genomes) of every prefix into one ptt file under self.combine_path.

        file_types maps prefix -> "normal" or "srna".
        Returns the value of the last _combine_annotation call.'''
        # NOTE(review): "check" stays unbound when file_types is empty or
        # holds no "normal"/"srna" entry, so the return would raise - confirm
        # that callers always pass at least one entry.
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                check = self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                check = self._combine_annotation(combine_file, files)
        return check

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term, log):
        '''run TransTermHP for one genome; its stdout goes to the open
        file object "out" and the same command line is written to log.'''
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta,
              os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)
        # the argument list is repeated here only to record the command
        log.write(" ".join([args_term.TransTermHP_path, "-p",
                            args_term.expterm_path, fasta,
                            os.path.join(self.combine_path, file_),
                            "--t2t-perf",
                            os.path.join(out_path, "_".join([
                                prefix,
                                "terminators_within_robust_tail-to-tail_regions.t2t"])),
                            "--bag-output", os.path.join(out_path, "_".join([
                                prefix,
                                "best_terminator_after_gene.bag"]))]) + "\n")

    def _run_TransTermHP(self, args_term, log):
        '''run TransTermHP for every ".ptt" file in self.combine_path and
        remove self.combine_path afterwards. Exits the program when the
        fasta file of a prefix can not be found.'''
        self.helper.check_make_folder(self.tmps["transterm"])
        log.write("Running TransTermHP.\n")
        log.write("Make sure the version is at least 2.09.\n")
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                    self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found!.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term, log)
                log.write("Done!\n")
                log.write("The following files are generated in {0}.\n".format(
                    out_path))
                # NOTE(review): this inner loop reuses the name "file_" of
                # the outer loop; the outer loop rebinds it on its next step.
                for file_ in os.listdir(out_path):
                    log.write("\t" + file_ + "\n")
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term, log):
        '''convert every ".bag" file found in the TransTermHP output folder
        of each prefix to gff and hand the results to
        multiparser.combine_gff.'''
        log.write("Running coverter.py to convert the results of TransTermHP "
                  "to gff3 format.\n")
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
                            log.write("\t" + out_file + " is generated.\n")
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        '''return the folder of wiggle files to use. If both tex_wigs and
        frag_wigs are assigned, their plain files are copied into a
        "merge_wigs" folder next to tex_wigs; if only one is assigned it is
        returned directly; if none is assigned the program exits.'''
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            # parent folder of tex_wigs (path split on "/")
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information

        When sRNAs is given, the genome gff and the sRNA gff of each prefix
        are merged and sorted into self.tmps["merge"], which is returned;
        otherwise gff_path is returned unchanged.'''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                # start from a clean temporary file for every prefix
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        '''for every "*_term.gff" in term_outfolder: sort it, move its
        content into the "all" gff folder (with a gff3 header) and build the
        matching csv table from the per-genome "term_raw.csv" files.'''
        # NOTE(review): csv_outfolder is not used in this method.
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                        os.path.join(self.csvs["all"], "_".join([
                            prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                        os.path.join(term_outfolder, gff),
                        os.path.join(
                            self.terms["all"], "_".join([
                                prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                              "Strand", "Detect", "Coverage_decrease",
                              "Coverage_detail"]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    # merge the raw table each time the seq_id changes
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log):
        '''run RNAfold through the shell with tmp_seq as stdin and tmp_sec
        as stdout. The command is executed inside self.tmps["folder"], which
        is created before and removed after the run; the previous working
        directory is restored.'''
        log.write("Computing secondray structures of {0}.\n".format(prefix))
        log.write("Make sure the version of Vienna RNA package is at least 2.3.2.\n")
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        # paths are prefixed with ".." because of the chdir above
        log.write(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]) + "\n")
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]))
        log.write("Done!\n")
        log.write("\t" + tmp_sec + " is generated for storing secondary "
                  "structure.\n")
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term, log):
        '''the approach for searching gene converged region terminator

        For each prefix: extract candidate sequences, fold them with
        RNAfold, extract the structure information, detect poly-T
        candidates and run detect_coverage. Afterwards the gff files are
        combined and moved by _move_file.'''
        log.write("Searching terminators which located in gene converged "
                  "region.\n")
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            tmp_cand = tmp_cand = os.path.join(args_term.out_folder,
                                 "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                log.write("Running get_inter_seq.py to extract the potential "
                          "sequences from {0}.\n".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index,
                               args_term)
                log.write("\t" + tmp_seq + " is generated for storing the "
                          "potential sequences.\n")
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix, log)
                log.write("Running extract_sec_info.py to extract the "
                          "information of secondary structure from {0}.\n".format(
                              prefix))
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                log.write("Running get_polyT.py to detect the "
                          "terminator candidates for {0}.\n".format(prefix))
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
                log.write("\t" + tmp_cand + " which temporary stores terminator "
                          "candidates is generated.\n")
            # NOTE(review): the nesting below was reconstructed from a
            # whitespace-collapsed source - confirm that detect_coverage is
            # meant to run even when the transcript file does not exist.
            print("Detecting terminators for " + prefix)
            log.write("Running detect_coverage_term.py to gain "
                      "high-confidence terminators for {0}.\n".format(prefix))
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        '''remove the temporary folders and files created during the run.'''
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            # the merge folder is only created by _merge_sRNA when sRNAs
            # are assigned
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term, log):
        '''renumber the terminators of every "*_term_all.gff" (ID and Name
        attributes), run stat_term for each prefix, move the generated csv
        tables to the table folders and log the generated files.'''
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # zero-padded (width 5) running number used in Name
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                            entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                  entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"], os.path.join(self.terms["all"],
                            "_".join([new_prefix, self.suffixs["gff"]])))
        log.write("Running stat_term.py to do statistics.\n")
        stat_path = os.path.join(args_term.out_folder, "statistics")
        log.write("The following files are generated:\n")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                      "_".join([prefix, "term"])))
            # the csv tables are expected in the gff folders at this point
            # and are moved to the table folders
            shutil.move(os.path.join(self.terms["best"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["allgff"]])))
            log.write("\t" + os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["best"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["express"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["non"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])) + "\n")
            log.write("\t" + os.path.join(self.csvs["best"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["express"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["non"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")

    def _check_gff_file(self, folder):
        '''run helper.check_uni_attributes on every ".gff" file of folder.'''
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs, log):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        # the prefixs argument is discarded; the list is rebuilt from the
        # transcript files found in self.tran_path
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        log.write("Running compare_tran_term.py for comparing transcripts "
                  "and terminators.\n")
        log.write("The following files are generated:\n")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            # rename the statistics file so that it carries the candidate
            # type
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix + ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript", prefix,
                                  type_ + ".csv"])))
                log.write("\t" + os.path.join(
                    args_term.out_folder, "statistics",
                    "_".join(["stat_compare_terminator_transcript", prefix,
                              type_ + ".csv"])) + "\n")

    def _re_table(self, args_term, prefixs, log):
        '''run reorganize_table on every table of the four candidate
        folders below <out_folder>/tables.'''
        # NOTE(review): prefixs is not used in this method.
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates",
                      "expressed_candidates", "non_expressed_candidates"]:
            for table in os.listdir(os.path.join(
                    args_term.out_folder, "tables", type_)):
                term_table = os.path.join(args_term.out_folder, "tables",
                                          type_, table)
                reorganize_table(args_term.libs, args_term.merge_wigs,
                                 "Coverage_detail", term_table)
                log.write("\t" + term_table + "\n")

    def run_terminator(self, args_term, log):
        '''entry point: check the inputs, run TransTermHP, search the
        converged-region terminators, compute the statistics, compare with
        the transcripts, regenerate the tables and clean up.'''
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas, log)
        check = self._combine_ptt_rnt(self.gff_path, file_types,
                                      self.srna_path)
        self._run_TransTermHP(args_term, log)
        self._convert_to_gff(prefixs, args_term, log)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        # NOTE(review): only this parser call is assumed to depend on
        # "check"; the nesting was reconstructed from a collapsed source.
        if check != "NO_CDS":
            self.multiparser.parser_gff(self.tmps["transterm"],
                                        self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term, log)
        self._compute_stat(args_term, log)
        self._compare_term_tran(args_term, prefixs, log)
        self._re_table(args_term, prefixs, log)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class TSSpredator(object):
    '''generate the config files of TSSpredator, run it, convert the
    MasterTable to gff and post-process the detected TSSs/processing sites'''

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # names of temporary files/folders used by the methods below
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # manual_path only exists when manual TSSs are assigned
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''turn the five fields of one library description
        (wig, tex, condition, replicate, strand) into a dict;
        the condition is converted to int.'''
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        '''write one "<prefix>_<condition><replicate> = <wig>" line per
        library and an empty "<prefix>_<condition><replicate> = " line for
        every replicate of rep_set that a condition does not provide.

        lib_num  -- highest condition number (conditions are 1..lib_num)
        lib_list -- dicts as produced by _assign_dict
        out      -- object with a write() method'''
        for num_id in range(1, lib_num + 1):
            cond_sort_list = sorted(
                (lib for lib in lib_list if lib["condition"] == num_id),
                key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    # use num_id: the loop variable "cond" may be stale
                    # (previous condition) or unbound when this condition
                    # has no library at all
                    out.write("{0}_{1}{2} = \n".format(prefix, num_id, rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix,
                      log):
        '''run "java -jar <tsspredator_path> <config_file>"; stdout and
        stderr are stored in log.txt and err.txt of out_path.'''
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        command = ["java", "-jar", tsspredator_path, config_file]
        # context managers close both files even if the call fails
        with open(os.path.join(out_path, "log.txt"), "w") as out, \
                open(os.path.join(out_path, "err.txt"), "w") as err:
            log.write(" ".join(command) + "\n")
            call(command, stdout=out, stderr=err)
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        '''parse the library descriptions ("wig:tex:condition:replicate:
        strand"), write the annotation, fivePrime and genome lines of the
        config file and return
        (lib_num, num_id, rep_set, lib_dict, list_num_id).

        Exits the program when a wiggle name does not end with ".wig" or
        when program is neither "tss" nor "ps".'''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        # (tex type, strand) -> key of lib_dict
        lib_keys = {("tex", "+"): "fp", ("tex", "-"): "fm",
                    ("notex", "+"): "np", ("notex", "-"): "nm"}
        lib_num = 0
        # stays 0 when no condition is found, so the return never fails
        num_id = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # replace the library name by the genome-specific wiggle file
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            rep_set.add(lib_datas[3])
            key = lib_keys.get((lib_datas[1], lib_datas[4]))
            if key is not None:
                lib_dict[key].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            # for processing sites the untreated libraries take the
            # "fivePrime" role
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match

        Entries of args_tss.repmatch look like "<lib>_<number>". An entry
        containing "all" sets one global minNumRepMatches. Otherwise the
        most frequent number becomes the global value and every library
        with another number gets its own minNumRepMatches_<lib> line.'''
        for rep in args_tss.repmatch:
            if "all" in rep:
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                return
        nums = {}
        matchs = {}
        for match in args_tss.repmatch:
            lib = match.split("_")[0]
            rep = match.split("_")[-1]
            matchs[lib] = rep
            nums[rep] = nums.get(rep, 0) + 1
        max_rep = None
        if nums:
            # max() returns the first key holding the highest count, which
            # is the same one a first-match scan of the dict would pick
            max_rep = max(nums, key=nums.get)
            out.write("minNumRepMatches = {0}\n".format(max_rep))
        for lib, rep in matchs.items():
            if rep != max_rep:
                out.write("minNumRepMatches_{0} = {1}\n".format(lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        '''read the parameters from the last line of best_<prefix>.csv in
        args_tss.auto_load and return (height, height_reduction, factor,
        factor_reduction, base_height, enrichment_factor,
        processing_factor) as strings.

        Exits the program when the file is missing or incomplete.'''
        detect = False
        target = "_".join(["best", prefix + ".csv"])
        for best_file in os.listdir(args_tss.auto_load):
            if best_file != target:
                continue
            with open(os.path.join(args_tss.auto_load, best_file), "r") as bh:
                lines = bh.readlines()
            # an empty file is handled like a line with missing columns
            last_line = lines[-1] if lines else ""
            if len(last_line.split("\t")) < 8:
                print("Error: some information in {0} is missing. "
                      "It may be due to that \"optimize_tss_ps\" did "
                      "not finish successfully.".format(best_file))
                log.write("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.\n".format(best_file))
                sys.exit()
            # second column: "he_<v>_rh_<v>_fa_<v>_rf_<v>_bh_<v>_ef_<v>_pf_<v>"
            para_info = last_line.split("\t")[1].split("_")
            detect_all = all(elem in para_info
                             for elem in ["he", "rh", "fa", "rf",
                                          "bh", "ef", "pf"])
            if (not detect_all) or (len(para_info) != 14):
                print("Error: {0} is incomplete. Some parameters are "
                      "missing!".format(best_file))
                log.write("Error: {0} is incomplete. Some parameters "
                          "are missing!\n".format(best_file))
                sys.exit()
            detect = True
            height = para_info[para_info.index("he") + 1]
            height_reduction = para_info[para_info.index("rh") + 1]
            factor = para_info[para_info.index("fa") + 1]
            factor_reduction = para_info[para_info.index("rf") + 1]
            base_height = para_info[para_info.index("bh") + 1]
            enrichment_factor = para_info[para_info.index("ef") + 1]
            processing_factor = para_info[para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                   base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        '''return the seven user-assigned parameters for prefix. Without
        args_tss.genome_order the first value of each list is used,
        otherwise the value at the position of prefix in genome_order.
        Exits the program when prefix is not listed in genome_order.'''
        if args_tss.genome_order is None:
            index = 0
        elif prefix not in args_tss.genome_order:
            print("Error: the parameters for {0} were not assigned!".format(
                prefix))
            log.write("Error: the parameters for {0} were not assigned!\n".format(
                prefix))
            sys.exit()
        else:
            index = args_tss.genome_order.index(prefix)
        height = args_tss.height[index]
        height_reduction = args_tss.height_reduction[index]
        factor = args_tss.factor[index]
        factor_reduction = args_tss.factor_reduction[index]
        base_height = args_tss.base_height[index]
        enrichment_factor = args_tss.enrichment_factor[index]
        processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
               base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor = \
                self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor = \
                self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        # the context manager closes the config file on every exit path
        with open(config_file, "w") as out:
            out.write("TSSinClusterSelectionMethod = HIGHEST\n")
            out.write("allowedCompareShift = 1\n")
            out.write("allowedRepCompareShift = 1\n")
            lib_num, num_id, rep_set, lib_dict, list_num_id = \
                self._import_lib(args_tss.libs, wig_folder,
                                 project_strain_name, out, gff,
                                 args_tss.program, fasta)
            out.write("idList = ")
            out.write(",".join(list_num_id) + "\n")
            out.write("maxASutrLength = 100\n")
            out.write("maxGapLengthInGene = 500\n")
            out.write("maxNormalTo5primeFactor = {0}\n".format(
                processing_factor))
            out.write("maxTSSinClusterDistance = {0}\n".format(
                args_tss.cluster + 1))
            out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
            out.write("min5primeToNormalFactor = {0}\n".format(
                enrichment_factor))
            out.write("minCliffFactor = {0}\n".format(factor))
            out.write("minCliffFactorDiscount = {0}\n".format(
                factor_reduction))
            out.write("minCliffHeight = {0}\n".format(height))
            out.write("minCliffHeightDiscount = {0}\n".format(
                height_reduction))
            out.write("minNormalHeight = {0}\n".format(base_height))
            self._print_repmatch(args_tss, out)
            out.write("minPlateauLength = 0\n")
            out.write("mode = cond\n")
            out.write("normPercentile = 0.9\n")
            # the "normal" libraries are the counterpart of the ones
            # written as "fivePrime" by _import_lib
            if args_tss.program.lower() == "tss":
                self._print_lib(lib_num, lib_dict["nm"], out,
                                wig_folder, "normalMinus", rep_set)
                self._print_lib(lib_num, lib_dict["np"], out,
                                wig_folder, "normalPlus", rep_set)
            else:
                self._print_lib(lib_num, lib_dict["fm"], out,
                                wig_folder, "normalMinus", rep_set)
                self._print_lib(lib_num, lib_dict["fp"], out,
                                wig_folder, "normalPlus", rep_set)
            out.write("numReplicates = {0}\n".format(len(rep_set)))
            out.write("numberOfDatasets = {0}\n".format(lib_num))
            out.write("outputDirectory = {0}\n".format(out_path))
            for prefix_id, out_prefix in enumerate(args_tss.output_prefixs):
                out.write("outputPrefix_{0} = {1}\n".format(
                    prefix_id + 1, out_prefix))
            out.write("projectName = {0}\n".format(project_strain_name))
            out.write("superGraphCompatibility = igb\n")
            out.write("texNormPercentile = 0.5\n")
            out.write("writeGraphs = 0\n")
            out.write("writeNocornacFiles = 0\n")
            log.write("\t" + config_file + " is generated.\n")

    def _convert_gff(self, prefixs, args_tss, log):
        '''convert MasterTable.tsv of every prefix to
        <gff_outfolder>/<prefix>_<program>.gff. A missing MasterTable is
        reported and leaves an empty gff file.'''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            # opening in "w" mode creates/truncates the output file; the
            # handle itself is never written to
            with open(out_file, "w"):
                if "MasterTable.tsv" not in os.listdir(out_path):
                    print("Error: There is not MasterTable file in {0} ".format(
                        out_path))
                    print("Please check configuration file.")
                    log.write("not MasterTable file is found in {0}\n".format(
                        out_path))
                else:
                    # NOTE(review): "feature" is only bound for the
                    # programs "processing" and "tss".
                    if args_tss.program.lower() == "processing":
                        feature = "processing_site"
                    elif args_tss.program.lower() == "tss":
                        feature = "TSS"
                    self.converter.convert_mastertable2gff(
                        os.path.join(out_path, "MasterTable.tsv"),
                        "ANNOgesic", feature, prefix, out_file)
                    log.write("\t" + out_file + " is generated.\n")

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # find the annotation file of this genome; if none matches,
            # "gff" keeps the last listed name
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # the annotation file is replaced by the file written by
            # validate_gff
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            # NOTE(review): the file name is fixed to "TSS.gff" even when
            # the program is "processing" - confirm that this is intended.
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
                log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        '''run stat_tsspredator and plot_venn for every genome and collect
        the generated files in <stat_outfolder>/<genome>.'''
        print("Running statistaics")
        for tss in tsss:
            stat_folder = os.path.join(self.stat_outfolder, tss)
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(stat_folder, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(stat_folder, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (stat_folder)))
            for file_ in os.listdir(stat_folder):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        '''return the genome names which own a fasta file, a gff file and
        at least one wiggle file ("<lib>_STRAIN_<genome>.wig").'''
        prefixs = []
        gff_names = os.listdir(self.gff_path)
        wig_names = os.listdir(self.wig_path)
        for fasta in os.listdir(self.fasta_path):
            for gff in gff_names:
                if fasta[:-3] != gff[:-4]:
                    continue
                prefix = fasta[:-3]
                # evaluated per genome: a hit of an earlier genome must not
                # qualify a genome without wiggle files
                detect = False
                for wig in wig_names:
                    filename = wig.split("_STRAIN_")
                    if (len(filename) > 1) and (
                            filename[1][:-4] == prefix):
                        detect = True
                        break
                if detect:
                    prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''merge the wiggle files of wig_folder whose name contains both
        the library name and prefix into tmp/merge_forward.wig ("+"
        libraries) and tmp/merge_reverse.wig ("-" libraries).'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        # strand of the library -> merged output file
        targets = (("+", os.path.join("tmp", "merge_forward.wig")),
                   ("-", os.path.join("tmp", "merge_reverse.wig")))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                for strand, merged in targets:
                    if (info[0][:-4] in wig_file) and (
                            info[-1] == strand) and (
                            prefix in wig_file) and (
                            os.path.isfile(os.path.join(
                                wig_folder, wig_file))):
                        Helper().merge_file(
                            os.path.join(wig_folder, wig_file), merged)

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''remove the temporary files/folders; this deletes
        args_tss.wig_folder and, if assigned, args_tss.manual as well.'''
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        # ta_files is optional (see __init__), so it is only cleaned when
        # assigned
        if args_tss.ta_files is not None:
            self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and
        processing site at the same position'''
        if not args_tss.overlap_feature:
            return
        print("Comparing TSSs and Processing sites")
        # suffix of the files of the current program and of the reference
        if args_tss.program.lower() == "tss":
            own_suffix, ref_suffix = "_TSS.gff", "_processing.gff"
        elif args_tss.program.lower() == "processing":
            own_suffix, ref_suffix = "_processing.gff", "_TSS.gff"
        else:
            return
        for tss in os.listdir(out_folder):
            if tss.endswith(own_suffix):
                ref = self.helper.get_correct_file(
                    args_tss.overlap_gffs, ref_suffix,
                    tss.replace(own_suffix, ""), None, None)
                filter_tss_pro(os.path.join(out_folder, tss),
                               ref, args_tss.program,
                               args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                cutoff_file = os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"]))
                # one cutoff table per genome, closed as soon as written
                with open(cutoff_file, "w") as out:
                    out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                    cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                    out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None

    def run_tsspredator(self, args_tss, log):
        '''entry point: generate the configs, run TSSpredator for every
        genome, convert the results to gff and apply the optional
        post-processing steps (orphan check, low expression filter, manual
        merge, overlap filter, statistics, validation, transcript
        comparison). Temporary files are removed at the end.'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # from here on the program name "ps" is handled as "processing"
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class OperonDetection(object):
    '''detection of operon'''

    def __init__(self, args_op):
        '''Create the helper objects and derive the working paths.

        TSS and terminator inputs are optional: when ``args_op.tsss`` or
        ``args_op.terms`` is None the matching path attribute is None.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_op.tsss is not None:
            self.tss_path = os.path.join(args_op.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")
        else:
            self.term_path = None

    def _check_gff(self, gffs, type_):
        '''Run the attribute check of the helper on every ``.gff`` file
        in the folder ``gffs``.

        ``type_`` is accepted for the callers but is not used here.
        '''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _detect_operon(self, prefixs, args_op, log):
        '''Call ``operon`` once per prefix and log the generated files.

        Missing optional inputs (TSS, terminator) are passed on as False.
        '''
        log.write("Running detect_operon.py to detect operon.\n")
        log.write("The the following files are generated:\n")
        for prefix in prefixs:
            out_gff = os.path.join(args_op.output_folder, "gffs",
                                   "_".join([prefix, "operon.gff"]))
            out_table = os.path.join(self.table_path,
                                     "_".join([prefix, "operon.csv"]))
            print("Detecting operons of {0}".format(prefix))
            if self.tss_path is None:
                tss = False
            else:
                tss = self.helper.get_correct_file(
                    self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            gff = self.helper.get_correct_file(
                args_op.gffs, ".gff", prefix, None, None)
            if self.term_path is None:
                term = False
            else:
                term = self.helper.get_correct_file(
                    self.term_path, "_term.gff", prefix, None, None)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table, out_gff)
            log.write("\t" + out_table + "\n")
            log.write("\t" + out_gff + "\n")

    def _check_and_parser_gff(self, args_op):
        '''Check the input gff folders and let the multiparser parse and
        combine them; TSS and terminator folders only when provided.'''
        self._check_gff(args_op.gffs, "gff")
        self._check_gff(args_op.trans, "tran")
        self.multiparser.parser_gff(args_op.gffs, None)
        self.multiparser.parser_gff(args_op.trans, "transcript")
        self.multiparser.combine_gff(args_op.gffs, self.tran_path,
                                     None, "transcript")
        if args_op.tsss is not None:
            self._check_gff(args_op.tsss, "tss")
            self.multiparser.parser_gff(args_op.tsss, "TSS")
            self.multiparser.combine_gff(args_op.gffs, self.tss_path,
                                         None, "TSS")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.multiparser.parser_gff(args_op.terms, "term")
            self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                         None, "term")

    def _stat(self, table_path, stat_folder, log):
        '''Call ``stat`` for every ``*_operon.csv`` table in
        ``table_path``; the output is ``stat_<table>`` in ``stat_folder``.'''
        log.write("Running stat_operon.py to do statistics.\n")
        for table in os.listdir(table_path):
            if table.endswith("_operon.csv"):
                filename = "_".join(["stat", table])
                out_stat = os.path.join(stat_folder, filename)
                stat(os.path.join(table_path, table), out_stat)
                log.write("\t" + out_stat + "\n")

    def run_operon(self, args_op, log):
        '''Entry point: parse the inputs, detect operons for every
        ``.gff`` file in ``args_op.gffs``, compute the statistics and
        remove the temporary folders.'''
        self._check_and_parser_gff(args_op)
        prefixs = []
        for gff in os.listdir(args_op.gffs):
            if gff.endswith(".gff"):
                prefixs.append(gff.replace(".gff", ""))
        self._detect_operon(prefixs, args_op, log)
        self._stat(self.table_path, args_op.stat_folder, log)
        self.helper.remove_tmp_dir(args_op.gffs)
        # TSS input is optional (see __init__); only clean it up when it
        # was given, exactly like the terminator folder below.
        if args_op.tsss is not None:
            self.helper.remove_tmp_dir(args_op.tsss)
        self.helper.remove_tmp_dir(args_op.trans)
        if args_op.terms is not None:
            self.helper.remove_tmp_dir(args_op.terms)
class sRNADetection(object):
    '''detection of sRNA'''

    def __init__(self, args_srna):
        '''Create the helper objects and derive every output and
        temporary path from the folders stored in ``args_srna``.'''
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        out_folder = args_srna.out_folder
        self.gff_output = os.path.join(out_folder, "gffs")
        self.table_output = os.path.join(out_folder, "tables")
        self.stat_path = os.path.join(out_folder, "statistics")
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(out_folder, "merge_wigs")
        # Path prefixes of temporary files; the genome prefix is appended
        # with "_" wherever they are used.
        self.prefixs = {
            "merge": os.path.join(out_folder, "tmp_merge"),
            "utr": os.path.join(out_folder, "tmp_utrsrna"),
            "normal": os.path.join(out_folder, "tmp_normal"),
            "in_cds": os.path.join(out_folder, "tmp_incds"),
            "merge_table": os.path.join(out_folder, "tmp_merge_table"),
            "utr_table": os.path.join(out_folder, "tmp_utrsrna_table"),
            "normal_table": os.path.join(out_folder, "tmp_normal_table"),
            "in_cds_table": os.path.join(out_folder, "tmp_incds_table"),
            "basic": os.path.join(out_folder, "tmp_basic"),
            "energy": os.path.join(out_folder, "tmp_energy"),
        }
        self.tmps = {
            "nr": os.path.join(out_folder, "tmp_nr"),
            "srna": os.path.join(out_folder, "tmp_sRNA"),
        }
        self.best_table = os.path.join(self.table_output, "best")
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best"),
        }

    def _check_folder_exist(self, folder):
        '''Return ``<folder>/tmp`` or None when ``folder`` is None.'''
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        '''Run the attribute check of the helper on every ``.gff`` file
        in the folder ``gffs``.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        '''Run ``makeblastdb``; its stderr is written to ``err``.'''
        call([os.path.join(blast_path, "makeblastdb"), "-in", database,
              "-dbtype", type_, "-out", db_file], stderr=err)

    def _formatdb(self, database, type_, out_folder,
                  blast_path, database_type):
        '''Format ``database`` for BLAST.

        When ``database`` is given without a fasta extension, the fasta
        file with that base name is looked up in the same folder. For
        ``database_type == "sRNA"`` the fasta file is first rewritten in
        place by ``change_format``. stderr of makeblastdb goes to
        ``<out_folder>/log.txt``.
        '''
        fasta_suffixes = (".fa", ".fna", ".fasta")
        # "with" guarantees the log file is closed even if a step fails
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            if not database.endswith(fasta_suffixes):
                folders = database.split("/")
                filename = folders[-1]
                folder = "/".join(folders[:-1])
                for fasta in os.listdir(folder):
                    if fasta.endswith(fasta_suffixes) and (
                            ".".join(fasta.split(".")[:-1]) == filename):
                        database = os.path.join(folder, fasta)
            if database_type == "sRNA":
                change_format(database, "tmp_srna_database")
                os.remove(database)
                shutil.move("tmp_srna_database", database)
            db_file = ".".join(database.split(".")[:-1])
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        '''Merge the fragmented and TEX results (whichever exist) into
        ``files["merge_gff"]`` / ``files["merge_csv"]`` and remove the
        intermediate files.'''
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        '''Run the intergenic sRNA detection for one genome.

        Returns the TSS file to use afterwards: the file in
        ``<out_folder>/TSS_class`` when that folder exists, otherwise the
        imported TSS file (None when "tss" is not in ``import_info``).
        '''
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if ("tss" in args_srna.import_info):
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(
                self.pro_path, "_processing.gff", prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss,
                pro, fuzzy_tss)
            intergenic_srna(args_srna)
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss,
                pro, fuzzy_tss)
            intergenic_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        if "TSS_class" in os.listdir(args_srna.out_folder):
            tss = os.path.join(args_srna.out_folder,
                               "TSS_class", prefix + "_TSS.gff")
        return tss

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna):
        '''Run the UTR-derived sRNA detection for one genome and filter
        the merged result with ``filter_utr``.'''
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files,
                pro, os.path.join(self.fasta_path, prefix + ".fa"),
                "tex", prefix, args_srna)
            utr_derived_srna(args_srna)
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files,
                pro, os.path.join(self.fasta_path, prefix + ".fa"),
                "frag", prefix, args_srna)
            utr_derived_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        '''Check that the required inputs are present (exits the program
        otherwise) and parse/combine every provided input folder.'''
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
                "blast_nr" in args_srna.import_info) or (
                "blast_srna" in args_srna.import_info):
            if args_srna.fastas is None:
                print("Error: lack required fasta files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _run_program(self, args_srna):
        '''Detect intergenic (and optionally UTR-derived) sRNAs for every
        ``.gff`` file in ``args_srna.gffs``.

        Returns the list of genome prefixes that were processed.
        '''
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"], prefix])}
                csvs = {"merge": "_".join([
                            self.prefixs["merge_table"], prefix]),
                        "utr": "_".join([self.prefixs["utr_table"], prefix]),
                        "normal": "_".join([
                            self.prefixs["normal_table"], prefix])}
                tss = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of {0}".format(
                          prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff",
                            prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix,
                                          tss, pro, args_srna)
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss)
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(
                    gffs["merge"],
                    "_".join([self.prefixs["basic"], prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss):
        '''Merge the gff files and tables of intergenic and UTR-derived
        sRNAs of one genome.'''
        print("merging data of intergenic and UTR_derived sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs,
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "forward.wig"])),
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "reverse.wig"])),
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        '''Run ``RNAfold -p`` with ``seq_file`` as stdin and ``sec_file``
        as stdout.'''
        # An argument list plus file handles replaces the former shell
        # pipeline, so paths with spaces or shell metacharacters are safe.
        with open(seq_file, "r") as seq_in, open(sec_file, "w") as sec_out:
            call([os.path.join(vienna_path, "RNAfold"), "-p"],
                 stdin=seq_in, stdout=sec_out)

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        '''Extract the sRNA sequences of ``prefix``, run RNAfold inside
        ``<out_folder>/tmp_srna`` and extract the energies.

        NOTE: this changes the working directory to the temporary folder;
        the caller is expected to change back to the returned "main" path.

        Returns a dict with the paths "sec", "dot", "main" and "tmp".
        Exits the program when no fasta file matches ``prefix``.
        '''
        fasta_file = None
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
                    fasta.replace(".fa", "") == prefix):
                fasta_file = os.path.join(fasta_path, fasta)
                break
        if fasta_file is None:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix]))
        sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix]))
        self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                            fasta_file, seq_file)
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        # From here on every path is anchored on the original directory.
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(os.path.join(main_path,
                       "_".join([self.prefixs["basic"], prefix])),
                       sec_file, os.path.join(main_path,
                       "_".join([self.prefixs["energy"], prefix])))
        # "|" in the names of the generated files is replaced by "_"
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path, "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_,
                    dot_file, rel_file):
        '''Run ``relplot.pl`` on a structure plot and its dot plot; the
        output is written to ``rel_file`` in the temporary folder.'''
        # Argument list + stdout handle instead of a shell redirection,
        # consistent with how mountain.pl is started in this class.
        with open(os.path.join(tmp_paths["tmp"], rel_file), "w") as out:
            call([os.path.join(vienna_util, "relplot.pl"),
                  os.path.join(tmp_paths["tmp"], file_),
                  os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        '''Convert one ``.ps`` file of the temporary folder to pdf.'''
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_), pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        '''Replot the structure plots, convert the plots to pdf and move
        them to the per-genome "sec" and "dot" output folders.'''
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if file_.endswith(("rss.ps", "dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
            ["rss.pdf"])
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
            ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        '''Run ``mountain.pl`` on a dot plot; stdout goes to ``out``.'''
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path,
                       tmp_paths, prefix, vienna_util):
        '''Generate a mountain plot pdf for every dot plot of the
        temporary folder when ``mountain`` is set.'''
        if not mountain:
            return
        tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
        os.mkdir(os.path.join(tmp_moun_path, prefix))
        txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
        self.helper.check_make_folder(txt_path)
        print("Generating mountain plot of {0}....".format(prefix))
        for dot_file in os.listdir(tmp_paths["tmp"]):
            if dot_file.endswith("dp.ps"):
                moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt")
                moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                print("Generating {0}".format(moun_file))
                # the handle is closed before the text file is read again
                with open(moun_txt, "w") as out:
                    self._run_mountain(vienna_util, tmp_paths,
                                       dot_file, out)
                plot_mountain_plot(moun_txt, moun_file)
                shutil.move(moun_file,
                            os.path.join(tmp_moun_path, prefix, moun_file))
                os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        '''Compute secondary structures and folding energies for every
        genome and replace the basic sRNA file by the one with energy.'''
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                self.fasta_path, args_srna.out_folder, prefix, sec_path,
                dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path,
                                tmp_paths, prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            # _get_seq_sec moved into the temporary folder; go back
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e,
                   seq_file, blast_file, strand):
        '''Run one BLAST program on ``seq_file`` and write the result to
        ``blast_file``.'''
        call([os.path.join(blast_path, program), "-db", database,
              "-evalue", str(e), "-strand", strand, "-query", seq_file,
              "-out", blast_file])

    def _get_strand_fasta(self, seq_file, out_folder):
        '''Split ``seq_file`` into ``tmp_plus.fa`` and ``tmp_minus.fa``
        in ``out_folder``, based on the last character ("+" or "-") of
        each header line.

        Returns the two file paths (plus, minus).
        '''
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        strand = ""
        with open(tmp_plus, "w") as out_p, \
                open(tmp_minus, "w") as out_m, \
                open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        '''BLAST the sRNA sequences of every genome against ``database``
        and replace the basic sRNA file by the annotated one.

        For ``database_type == "nr"`` both strands are searched
        separately and the two results are merged.
        '''
        if (database is None):
            print("Error: No database assigned!")
            return
        if database_format:
            self._formatdb(database, data_type, args_srna.out_folder,
                           args_srna.blast_path, database_type)
        for prefix in prefixs:
            blast_file = os.path.join(
                args_srna.out_folder, "blast_result_and_misc",
                "_".join([database_type, "blast", prefix + ".txt"]))
            srna_file = "_".join([self.prefixs["basic"], prefix])
            out_file = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp", database_type, prefix]))
            print("Running Blast of {0}".format(prefix))
            seq_file = os.path.join(
                args_srna.out_folder, "_".join(["sRNA_seq", prefix]))
            # NOTE(review): the former guard compared this full path with
            # the bare names of os.listdir() and therefore never matched,
            # so the sequences were always regenerated. That behaviour is
            # kept on purpose; reusing an existing file could pick up a
            # stale one - confirm before adding a cache check.
            self.helper.get_seq(
                srna_file,
                os.path.join(self.fasta_path, prefix + ".fa"),
                seq_file)
            if database_type == "nr":
                tmp_plus, tmp_minus = self._get_strand_fasta(
                    seq_file, args_srna.out_folder)
                tmp_blast = "tmp_blast.txt"
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_plus, tmp_blast, "plus")
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_minus, blast_file, "minus")
                self.helper.merge_file(tmp_blast, blast_file)
                os.remove(tmp_blast)
                os.remove(tmp_plus)
                os.remove(tmp_minus)
            else:
                self._run_blast(args_srna.blast_path, program, database, e,
                                seq_file, blast_file, "both")
            extract_blast(blast_file, srna_file, out_file,
                          out_file + ".csv", database_type)
            shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''Classify the sRNAs of every genome and generate one table per
        class gff file.'''
        # NOTE(review): this loop used to be wrapped in
        # "len(import_info) != 1 or len(import_info) != 0", which is true
        # for every length. The guard was dropped without changing the
        # behaviour; the intended condition should be confirmed.
        for prefix in prefixs:
            print("classifying sRNA of {0}".format(prefix))
            class_gff = os.path.join(self.gff_output, "for_class")
            class_table = os.path.join(self.table_output, "for_class")
            self.helper.check_make_folder(os.path.join(class_table, prefix))
            self.helper.check_make_folder(os.path.join(class_gff, prefix))
            class_gff = os.path.join(class_gff, prefix)
            class_table = os.path.join(class_table, prefix)
            self.helper.check_make_folder(class_table)
            self.helper.check_make_folder(class_gff)
            out_stat = os.path.join(
                self.stat_path, "_".join([
                    "stat_sRNA_class", prefix + ".csv"]))
            classify_srna(os.path.join(self.all_best["all_gff"],
                          "_".join([prefix, "sRNA.gff"])), class_gff,
                          out_stat, args_srna)
            for srna in os.listdir(class_gff):
                out_table = os.path.join(
                    class_table, srna.replace(".gff", ".csv"))
                gen_srna_table(
                    os.path.join(class_gff, srna),
                    "_".join([self.prefixs["merge_table"], prefix]),
                    "_".join([self.tmps["nr"], prefix + ".csv"]),
                    "_".join([self.tmps["srna"], prefix + ".csv"]),
                    args_srna, out_table)

    def _get_best_result(self, prefixs, args_srna):
        '''Generate the "best" gff file and table of every genome.'''
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(os.path.join(self.all_best["all_gff"],
                                       "_".join([prefix, "sRNA.gff"])),
                          best_gff, args_srna)
            gen_srna_table(os.path.join(self.all_best["best_gff"],
                           "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table)

    def _remove_file(self, args_srna):
        '''Remove the temporary files and folders of this run.'''
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''Run the optional filters that are listed in
        ``args_srna.import_info``: "sec_str", "blast_nr", "blast_srna"
        and "sorf".'''
        if "sec_str" in args_srna.import_info:
            self._compute_2d_and_energy(args_srna, prefixs)
        if "blast_nr" in args_srna.import_info:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        if "blast_srna" in args_srna.import_info:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)
        if "sorf" in args_srna.import_info:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))

    def _import_info_format(self, import_info):
        '''Return a new list with every entry lower-cased.'''
        return [info.lower() for info in import_info]

    def _gen_table(self, prefixs, args_srna):
        '''Generate the table of all sRNA candidates of every genome.'''
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(os.path.join(self.all_best["all_gff"],
                           "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table)

    def _print_rank_all(self, prefixs):
        '''Call ``print_rank_all`` with the "all" and "best" table of
        every genome.'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        '''Apply ``filter_utr`` to the "all" gff file and table of every
        genome.'''
        for prefix in prefixs:
            filter_utr(os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"])),
                       os.path.join(self.all_best["all_table"],
                                    "_".join([prefix, "sRNA.csv"])),
                       min_utr)

    def _antisense(self, gffs, prefixs):
        '''Call ``srna_antisense`` for the "all" and the "best" results
        of every genome with the genome annotation in ``gffs``.'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        '''Call ``blast_class`` for every table in
        ``<srna_tables>/best``; results go to ``stat_path``.'''
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path, "stat_" +
                srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''Compare the sRNAs of one genome with terminators and/or
        promoters when the matching information was imported.'''
        if ("term" in args_srna.import_info) and (
                self.term_path is not None):
            compare_srna_term(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])),
                              out_table, os.path.join(self.term_path,
                              "_".join([prefix, "term.gff"])),
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if ("promoter" in args_srna.import_info) and (
                args_srna.promoter_table is not None) and (
                "tss" in args_srna.import_info):
            compare_srna_promoter(os.path.join(self.all_best["all_gff"],
                                  "_".join([prefix, "sRNA.gff"])),
                                  out_table, args_srna)

    def run_srna_detection(self, args_srna):
        '''Entry point: check and parse the inputs, detect and filter the
        sRNAs, write the tables, classes, best results and statistics and
        finally remove the temporary files.'''
        # Normalise the filter names first: _check_necessary_file already
        # tests import_info against lower-case names.
        args_srna.import_info = self._import_info_format(
            args_srna.import_info)
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                            "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]),
                prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if "blast_srna" in args_srna.import_info:
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class sRNADetection(object):
    '''detection of sRNA'''

    def __init__(self, args_srna):
        '''Create the helper objects and derive all working paths.

        Every path is built from attributes of ``args_srna``; optional
        input folders (TSS, processing site, sORF, terminator) are mapped
        to ``<folder>/tmp`` or to None when the folder was not given.
        '''
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        out_folder = args_srna.out_folder
        self.gff_output = os.path.join(out_folder, "gffs")
        self.table_output = os.path.join(out_folder, "tables")
        self.stat_path = os.path.join(out_folder, "statistics")
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(out_folder, "merge_wigs")
        # Path prefixes of the temporary files; the genome prefix is
        # appended with "_" wherever they are used.
        self.prefixs = {
            "merge": os.path.join(out_folder, "tmp_merge"),
            "utr": os.path.join(out_folder, "tmp_utrsrna"),
            "normal": os.path.join(out_folder, "tmp_normal"),
            "in_cds": os.path.join(out_folder, "tmp_incds"),
            "merge_table": os.path.join(out_folder, "tmp_merge_table"),
            "utr_table": os.path.join(out_folder, "tmp_utrsrna_table"),
            "normal_table": os.path.join(out_folder, "tmp_normal_table"),
            "in_cds_table": os.path.join(out_folder, "tmp_incds_table"),
            "basic": os.path.join(out_folder, "tmp_basic"),
            "energy": os.path.join(out_folder, "tmp_energy"),
        }
        self.tmps = {
            "nr": os.path.join(out_folder, "tmp_nr"),
            "srna": os.path.join(out_folder, "tmp_sRNA"),
        }
        self.best_table = os.path.join(self.table_output, "best")
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best"),
        }

    def _check_folder_exist(self, folder):
        '''Return ``<folder>/tmp``, or None when ``folder`` is None.'''
        if folder is None:
            return None
        return os.path.join(folder, "tmp")

    def _check_gff(self, gffs):
        '''Run the helper's attribute check on every .gff file in ``gffs``.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        '''Call makeblastdb on ``database``; stderr goes to ``err``.'''
        call([
            os.path.join(blast_path, "makeblastdb"), "-in", database,
            "-dbtype", type_, "-out", db_file
        ], stderr=err)

    def _formatdb(self, database, type_, out_folder, blast_path,
                  database_type):
        '''Build the BLAST database for ``database``.

        When ``database`` has no fasta extension, the fasta file with the
        same base name in the same folder is used instead.  For the
        "sRNA" database type the fasta file is rewritten in place through
        ``change_format`` before makeblastdb runs.  makeblastdb's stderr
        is written to ``<out_folder>/log.txt``.
        '''
        fasta_exts = (".fa", ".fna", ".fasta")
        # The log is opened (and truncated) first, as before; the context
        # manager guarantees it is closed even when a step below fails.
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            if not database.endswith(fasta_exts):
                folder, filename = os.path.split(database)
                # A bare name has an empty folder part; list the current
                # directory instead of failing on os.listdir("").
                for fasta in os.listdir(folder or "."):
                    if fasta.endswith(fasta_exts) and (
                            ".".join(fasta.split(".")[:-1]) == filename):
                        database = os.path.join(folder, fasta)
            if database_type == "sRNA":
                change_format(database, "tmp_srna_database")
                os.remove(database)
                shutil.move("tmp_srna_database", database)
            db_file = ".".join(database.split(".")[:-1])
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        '''merge the results of fragmented and tex treated libs

        ``files`` maps "frag_gff", "frag_csv", "tex_gff", "tex_csv",
        "merge_gff" and "merge_csv" to paths.  The merged table is moved
        to ``merge_csv`` and the sorted annotation written to
        ``merge_gff``.
        '''
        has_frag = args_srna.frag_wigs is not None
        has_tex = args_srna.tex_wigs is not None
        if has_frag and has_tex:
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif has_frag:
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif has_tex:
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _read_lib_wig(self, args_srna):
        '''Read libraries and wiggle files.

        Returns the list ``[libs, texs, wigs_f, wigs_r]``.
        '''
        libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
        wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
        wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
        return [libs, texs, wigs_f, wigs_r]

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        '''detection of intergenic and antisense sRNA

        Runs ``intergenic_srna`` once per available library type
        (fragmented, TEX) and merges the results into the "normal"
        temporary gff/table of ``prefix``.

        Returns ``(tss, frag_datas, tex_datas)``; each ``*_datas`` is the
        list from ``_read_lib_wig`` or None when that library type was
        not supplied.
        '''
        tex_datas = None
        frag_datas = None
        out_folder = args_srna.out_folder
        if "tmp_cutoff_inter" in os.listdir(out_folder):
            os.remove(os.path.join(out_folder, "tmp_cutoff_inter"))
        files = {
            "frag_gff": None, "frag_csv": None,
            "tex_gff": None, "tex_csv": None,
            "merge_gff": None, "merge_csv": None
        }
        if self.tss_path is not None:
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(
                self.pro_path, "_processing.gff", prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            frag_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, frag_datas[0], frag_datas[1],
                            frag_datas[2], frag_datas[3])
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            tex_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, tex_datas[0], tex_datas[1],
                            tex_datas[2], tex_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # When a TSS_class folder is present in the output folder and no
        # TSS source is flagged, the TSS file from that folder is returned.
        if ("TSS_class" in os.listdir(args_srna.out_folder)) and (
                not args_srna.tss_source):
            tss = os.path.join(args_srna.out_folder, "TSS_class",
                               prefix + "_TSS.gff")
        return tss, frag_datas, tex_datas

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna,
                     frag_datas, tex_datas):
        '''detection of UTR-derived sRNA

        Runs ``utr_derived_srna`` once per available library type, merges
        the results into the "utr" temporary gff/table of ``prefix`` and
        filters them with ``filter_utr`` and ``args_srna.min_utr``.
        '''
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {
            "frag_gff": None, "frag_csv": None,
            "tex_gff": None, "tex_csv": None,
            "merge_gff": None, "merge_csv": None
        }
        fasta = os.path.join(self.fasta_path, prefix + ".fa")
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                fasta, "tex", prefix, args_srna)
            utr_derived_srna(args_srna, tex_datas[0], tex_datas[1],
                             tex_datas[2], tex_datas[3])
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                fasta, "frag", prefix, args_srna)
            utr_derived_srna(args_srna, frag_datas[0], frag_datas[1],
                             frag_datas[2], frag_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        '''Validate the required inputs and pre-parse the optional ones.

        Prints an error and exits when the annotation folder, the
        transcript folder or both wiggle sets are missing, or when
        UTR-derived detection is requested without TSS files.  Each
        supplied optional folder is checked, parsed and combined through
        the multiparser.
        '''
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                    args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if args_srna.tss_folder is None:
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if args_srna.pro_folder is None:
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        # The fasta files are read by _run_utrsrna and _blast whether or
        # not import_info was given, so the need for them is decided
        # without requiring import_info.  The "sec_str" test uses the
        # lower-cased list, the same form run_srna_detection works with.
        needs_sec_str = (args_srna.import_info is not None) and (
            "sec_str" in self._import_info_format(args_srna.import_info))
        if args_srna.utr_srna or needs_sec_str or (
                args_srna.nr_database is not None) or (
                args_srna.srna_database is not None):
            if args_srna.fastas is None:
                print("Error: lack required fasta files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _merge_tex_frag_datas(self, tex_datas, frag_datas):
        '''Fold the wiggle data of the fragmented libs into the TEX data.

        Elements 2 and 3 of each list (the forward and reverse wiggle
        dicts returned by ``_read_lib_wig``) are merged per strain and
        condition; ``tex_datas`` is modified in place.  When only
        ``frag_datas`` exists it is returned unchanged.
        '''
        if (tex_datas is not None) and (frag_datas is not None):
            for index in (2, 3):
                merged = tex_datas[index]
                for strain, conds in frag_datas[index].items():
                    if strain not in merged:
                        merged[strain] = conds
                    else:
                        for cond, tracks in conds.items():
                            merged[strain][cond] = tracks
        elif (tex_datas is None) and (frag_datas is not None):
            tex_datas = frag_datas
        return tex_datas

    def _run_program(self, args_srna):
        '''Run the detection for every .gff file in ``args_srna.gffs``.

        For each genome the intergenic/antisense detection runs first,
        then (optionally) the UTR-derived detection; the candidates are
        merged, filtered with ``filter_frag`` and sorted into the "basic"
        temporary gff.  Returns the list of genome prefixes processed.
        '''
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            print("Running sRNA detection of {0}....".format(prefix))
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            gffs = {
                "merge": "_".join([self.prefixs["merge"], prefix]),
                "utr": "_".join([self.prefixs["utr"], prefix]),
                "normal": "_".join([self.prefixs["normal"], prefix])
            }
            csvs = {
                "merge": "_".join([self.prefixs["merge_table"], prefix]),
                "utr": "_".join([self.prefixs["utr_table"], prefix]),
                "normal": "_".join([self.prefixs["normal_table"], prefix])
            }
            tss, frag_datas, tex_datas = self._run_normal(
                prefix, gff, tran, args_srna.fuzzy_tsss["inter"], args_srna)
            if args_srna.utr_srna:
                print("Running UTR derived sRNA detection of {0}".format(
                    prefix))
                if tss is None:
                    tss = self.helper.get_correct_file(
                        self.tss_path, "_TSS.gff", prefix, None, None)
                if self.pro_path is not None:
                    pro = self.helper.get_correct_file(
                        self.pro_path, "_processing.gff", prefix, None,
                        None)
                else:
                    pro = None
                if tss is not None:
                    self._run_utrsrna(gff, tran, prefix, tss, pro,
                                      args_srna, frag_datas, tex_datas)
            tex_datas = self._merge_tex_frag_datas(tex_datas, frag_datas)
            # Drop the wiggle data as soon as it is no longer needed.
            del frag_datas
            gc.collect()
            self._merge_srna(args_srna, gffs, csvs, prefix,
                             os.path.join(args_srna.gffs, gff), tss,
                             tex_datas)
            del tex_datas
            filter_frag(csvs["merge"], gffs["merge"])
            self.helper.sort_gff(gffs["merge"],
                                 "_".join([self.prefixs["basic"], prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss,
                    tex_datas):
        '''Merge the candidate gff files and tables of one genome.'''
        print("merging data of sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds, args_srna.cutoff_overlap,
                       gff_file)
        merge_srna_table(gffs["merge"], csvs, tex_datas[2], tex_datas[3],
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        '''Run ``RNAfold -p`` with ``seq_file`` on stdin, writing stdout
        to ``sec_file``.

        The program is started with an argument list and explicit file
        redirection rather than a shell string, so paths containing
        spaces or shell metacharacters are passed through unchanged.
        '''
        with open(seq_file, "r") as seq_h, open(sec_file, "w") as sec_h:
            call([os.path.join(vienna_path, "RNAfold"), "-p"],
                 stdin=seq_h, stdout=sec_h)

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        '''extract the sec str energy

        Extracts the candidate sequences of ``prefix``, runs RNAfold in
        ``<out_folder>/tmp_srna`` and writes the result of
        ``extract_energy`` to the "energy" temporary file.

        NOTE: the working directory is changed to the temporary folder
        and is left there; the caller switches back using the returned
        "main" path.

        Returns a dict with the absolute "sec", "dot", "main" and "tmp"
        paths.
        '''
        fasta = None
        for candidate in os.listdir(fasta_path):
            if candidate.endswith(".fa") and (
                    candidate.replace(".fa", "") == prefix):
                fasta = candidate
                break
        if fasta is None:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        basic_gff = "_".join([self.prefixs["basic"], prefix])
        seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix]))
        sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix]))
        self.helper.get_seq(basic_gff, os.path.join(fasta_path, fasta),
                            seq_file)
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        # After the chdir, relative paths must be anchored at the
        # original working directory.
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(
            os.path.join(main_path, basic_gff), sec_file,
            os.path.join(main_path,
                         "_".join([self.prefixs["energy"], prefix])))
        # Replace "|" in the names of the files found in the working
        # folder with "_".
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            if new_ps != ps:
                shutil.move(ps, new_ps)
        return {
            "sec": tmp_sec_path,
            "dot": tmp_dot_path,
            "main": main_path,
            "tmp": os.path.join(main_path, tmp_path)
        }

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file,
                    rel_file):
        '''Run relplot.pl on one structure/dot-plot pair, writing its
        stdout to ``rel_file``; all files are in ``tmp_paths["tmp"]``.

        Uses an argument list instead of a shell string so file names
        are never interpreted by a shell.
        '''
        tmp = tmp_paths["tmp"]
        with open(os.path.join(tmp, rel_file), "w") as rel_h:
            call([
                os.path.join(vienna_util, "relplot.pl"),
                os.path.join(tmp, file_),
                os.path.join(tmp, dot_file)
            ], stdout=rel_h)

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        '''Convert one PostScript file of the temporary folder to pdf.'''
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_),
              pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path,
                           prefix):
        '''Replot the structure plots, convert the plots to pdf and move
        the pdf files into the per-genome output folders.'''
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_, dot_file,
                                 rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if file_.endswith(("rss.ps", "dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths, file_, pdf_file)
        sec_out = os.path.join(tmp_paths["sec"], prefix)
        dot_out = os.path.join(tmp_paths["dot"], prefix)
        os.mkdir(sec_out)
        os.mkdir(dot_out)
        self.helper.move_all_content(tmp_paths["tmp"], sec_out, ["rss.pdf"])
        self.helper.move_all_content(tmp_paths["tmp"], dot_out, ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        '''Run mountain.pl on one dot plot; stdout goes to ``out``.'''
        call([
            os.path.join(vienna_util, "mountain.pl"),
            os.path.join(tmp_paths["tmp"], dot_file)
        ], stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix,
                       vienna_util):
        '''Generate one mountain plot per dot plot when ``mountain`` is
        set; the pdf files are moved to ``<moun_path>/<prefix>``.'''
        if not mountain:
            return
        tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
        os.mkdir(os.path.join(tmp_moun_path, prefix))
        txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
        self.helper.check_make_folder(txt_path)
        print("Generating mountain plot of {0}....".format(prefix))
        for dot_file in os.listdir(tmp_paths["tmp"]):
            if not dot_file.endswith("dp.ps"):
                continue
            moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt")
            moun_file = dot_file.replace("dp.ps", "mountain.pdf")
            # The text file is closed before it is plotted and removed.
            with open(moun_txt, "w") as out:
                print("Generating {0}".format(moun_file))
                self._run_mountain(vienna_util, tmp_paths, dot_file, out)
            plot_mountain_plot(moun_txt, moun_file)
            shutil.move(moun_file,
                        os.path.join(tmp_moun_path, prefix, moun_file))
            os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        '''Compute folding energy and secondary-structure plots for the
        candidates of every genome; the "basic" temporary gff is replaced
        by the "energy" temporary file.'''
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                self.fasta_path, args_srna.out_folder, prefix, sec_path,
                dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            # _get_seq_sec left the process inside the temporary folder.
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        '''Run one BLAST search, writing the report to ``blast_file``.'''
        call([
            os.path.join(blast_path, program), "-db", database,
            "-evalue", str(e), "-strand", strand,
            "-query", seq_file, "-out", blast_file
        ])

    def _get_strand_fasta(self, seq_file, out_folder):
        '''Split a fasta file by strand.

        Records whose header ends with "+" go to ``tmp_plus.fa``, records
        whose header ends with "-" go to ``tmp_minus.fa``; any other
        record is dropped together with its sequence lines.  Returns the
        two file paths.
        '''
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        with open(tmp_plus, "w") as out_p, open(tmp_minus, "w") as out_m, \
                open(seq_file) as sh:
            outputs = {"+": out_p, "-": out_m}
            current = None
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    # A header with an unknown strand selects no output,
                    # so its sequence is not appended to another record.
                    current = outputs.get(line[-1])
                if current is not None:
                    current.write(line + "\n")
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        '''BLAST the candidate sequences of every genome against
        ``database`` and add the hits to the "basic" temporary gff.

        For the "nr" database type the plus- and minus-strand sequences
        are searched separately and the two reports merged.
        '''
        if database is None:
            print("Error: No database assigned!")
            return
        if database_format:
            self._formatdb(database, data_type, args_srna.out_folder,
                           args_srna.blast_path, database_type)
        for prefix in prefixs:
            blast_file = os.path.join(
                args_srna.out_folder, "blast_result_and_misc",
                "_".join([database_type, "blast", prefix + ".txt"]))
            srna_file = "_".join([self.prefixs["basic"], prefix])
            out_file = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp", database_type, prefix]))
            print("Running Blast of {0} in {1}".format(prefix, database))
            seq_file = os.path.join(args_srna.out_folder,
                                    "_".join(["sRNA_seq", prefix]))
            # The sequences are always extracted again.  The former guard
            # compared this joined path with the bare names returned by
            # os.listdir and therefore never skipped the extraction;
            # keeping that behaviour avoids reusing a stale file.
            self.helper.get_seq(
                srna_file, os.path.join(self.fasta_path, prefix + ".fa"),
                seq_file)
            if database_type == "nr":
                tmp_plus, tmp_minus = self._get_strand_fasta(
                    seq_file, args_srna.out_folder)
                tmp_blast = os.path.join("tmp_blast.txt")
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_plus, tmp_blast, "plus")
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_minus, blast_file, "minus")
                self.helper.merge_file(tmp_blast, blast_file)
                os.remove(tmp_blast)
                os.remove(tmp_plus)
                os.remove(tmp_minus)
            else:
                self._run_blast(args_srna.blast_path, program, database, e,
                                seq_file, blast_file, "both")
            extract_blast(blast_file, srna_file, out_file,
                          out_file + ".csv", database_type)
            shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''classify the sRNA based on the filters'''
        if (args_srna.import_info is not None) or (
                args_srna.srna_database is not None) or (
                args_srna.nr_database is not None) or (
                self.sorf_path is not None) or (
                self.tss_path is not None) or (
                self.term_path is not None) or (
                args_srna.promoter_table is not None):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(
                    class_table, prefix))
                self.helper.check_make_folder(os.path.join(class_gff,
                                                           prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                classify_srna(
                    os.path.join(self.all_best["all_gff"],
                                 "_".join([prefix, "sRNA.gff"])),
                    class_gff, out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table, self.term_path)

    def _get_best_result(self, prefixs, args_srna):
        '''get the best results based on the filters'''
        for prefix in prefixs:
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(all_gff, best_gff, args_srna)
            gen_srna_table(
                best_gff,
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]),
                args_srna, best_table, self.term_path)

    def _remove_file(self, args_srna):
        '''Remove the temporary files and folders created by the run.'''
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        self.helper.remove_tmp(self.gff_output)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''set the filter of sRNA

        Runs, in this order: the secondary-structure/energy computation
        (when "sec_str" is in ``import_info``), the BLAST search against
        the nr database, the comparison with sORFs and the BLAST search
        against the sRNA database.
        '''
        # NOTE(review): the source formatting does not show whether the
        # three checks after "sec_str" sit inside this import_info guard;
        # they are treated as independent of it here - confirm.
        if args_srna.import_info is not None:
            if "sec_str" in args_srna.import_info:
                self._compute_2d_and_energy(args_srna, prefixs)
        if args_srna.nr_database is not None:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        if self.sorf_path is not None:
            for prefix in prefixs:
                sorf_name = "_".join([prefix, "sORF.gff"])
                if sorf_name in os.listdir(self.sorf_path):
                    basic_gff = "_".join([self.prefixs["basic"], prefix])
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        basic_gff,
                        os.path.join(self.sorf_path, sorf_name),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna, basic_gff)
        if args_srna.srna_database is not None:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)

    def _import_info_format(self, import_info):
        '''Return the entries of ``import_info`` lower-cased, as a list.'''
        return [info.lower() for info in import_info]

    def _gen_table(self, prefixs, args_srna):
        '''Generate the table of all candidates for every genome.'''
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]),
                args_srna, out_table, self.term_path)

    def _print_rank_all(self, prefixs):
        '''Call ``print_rank_all`` on the all/best tables of every genome.'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        '''filter out the low expressed UTR-derived sRNA'''
        for prefix in prefixs:
            filter_utr(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.all_best["all_table"],
                             "_".join([prefix, "sRNA.csv"])),
                min_utr)

    def _antisense(self, gffs, prefixs):
        '''detection of antisense'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            annotation = os.path.join(gffs, prefix + ".gff")
            srna_antisense(all_gff, all_table, annotation)
            srna_antisense(best_gff, best_table, annotation)

    def _blast_stat(self, stat_path, srna_tables):
        '''do statistics for blast result'''
        best_folder = os.path.join(srna_tables, "best")
        for srna_table in os.listdir(best_folder):
            out_srna_blast = os.path.join(
                stat_path,
                "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''compare sRNA with terminator and promoter'''
        all_gff = os.path.join(self.all_best["all_gff"],
                               "_".join([prefix, "sRNA.gff"]))
        if self.term_path is not None:
            compare_srna_term(
                all_gff, out_table,
                os.path.join(self.term_path,
                             "_".join([prefix, "term.gff"])),
                args_srna.fuzzy_b, args_srna.fuzzy_a)
        if args_srna.promoter_table is not None:
            compare_srna_promoter(all_gff, out_table, args_srna)

    def run_srna_detection(self, args_srna):
        '''Run the whole sRNA detection pipeline.

        Steps: input validation, transcript parsing, detection per
        genome, filtering, comparison with terminators/promoters, table
        generation, classification, selection of the best candidates,
        ranking, BLAST statistics and removal of the temporary files.
        '''
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None,
                                     "transcript")
        if args_srna.import_info is not None:
            args_srna.import_info = self._import_info_format(
                args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile(
                "_".join([self.prefixs["basic"], prefix]),
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]), prefix,
                args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        # import_info may be None (see the guard above); test it before
        # the membership check so a missing list does not raise TypeError.
        if (args_srna.srna_database is not None) and (
                args_srna.import_info is not None) and (
                "blast_srna" in args_srna.import_info):
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class Crispr(object):
    '''Detection of CRISPR'''

    def __init__(self, args_cris):
        '''Create the helper objects, derive the working paths and make
        the output folders through the helper.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_cris.gffs, "tmp")
        self.fasta_path = os.path.join(args_cris.fastas, "tmp")
        self.stat_folder = os.path.join(args_cris.out_folder, "statistics")
        self.gff_out = os.path.join(args_cris.out_folder, "gffs")
        self.all_out = os.path.join(args_cris.out_folder, "gffs",
                                    "all_candidates")
        self.best_out = os.path.join(args_cris.out_folder, "gffs", "best")
        self.helper.check_make_folder(self.all_out)
        self.helper.check_make_folder(self.best_out)
        self.data_folder = os.path.join(args_cris.out_folder, "CRT_output")
        self.helper.check_make_folder(self.data_folder)
        self.helper.check_make_folder(self.stat_folder)

    def _run_crt(self, args_cris):
        '''Running CRT

        CRT is started once per file in the fasta folder; its output is
        written to ``<data_folder>/<prefix>.txt``.
        '''
        print("Running CRT")
        for seq in os.listdir(self.fasta_path):
            prefix = ".".join(seq.split(".")[:-1])
            call([
                "java", "-cp", args_cris.crt_path, "crt",
                "-minNR", str(args_cris.min_num_r),
                "-minRL", str(args_cris.min_len_r),
                "-maxRL", str(args_cris.max_len_r),
                "-minSL", str(args_cris.min_len_s),
                "-maxSL", str(args_cris.max_len_s),
                "-searchWL", str(args_cris.win_size),
                os.path.join(self.fasta_path, seq),
                os.path.join(self.data_folder, prefix + ".txt")
            ])

    def _read_gff(self, txt):
        '''Return the gene, CDS, tRNA and rRNA entries of the annotation
        file that belongs to the CRT output file ``txt``.'''
        wanted = ("gene", "CDS", "tRNA", "rRNA")
        gff_file = os.path.join(self.gff_path, txt.replace(".txt", ".gff"))
        with open(gff_file, "r") as gh:
            return [entry for entry in Gff3Parser().entries(gh)
                    if entry.feature in wanted]

    def _compare_gff(self, strain, start, end, gffs, bh, indexs,
                     ignore_hypo):
        '''Compare CRISPR and genome annotation to remove the false
        positives

        A candidate overlaps when an annotation entry of the same strain
        intersects ``start``..``end`` and carries a "product" attribute;
        with ``ignore_hypo`` set, products containing
        "hypothetical protein" do not count.  A candidate without overlap
        is written to ``bh`` and ``indexs["best"]`` is incremented.

        Returns ``(overlap, id_)``; ``id_`` is None when it overlaps.
        '''
        overlap = False
        id_ = None
        for gff in gffs:
            if gff.seq_id != strain:
                continue
            if ((gff.start <= start) and (gff.end >= end)) or (
                    (gff.start >= start) and (gff.end <= end)) or (
                    (gff.start <= start) and (gff.end > start) and (
                        gff.end <= end)) or (
                    (gff.start >= start) and (gff.start < end) and (
                        gff.end >= end)):
                if "product" in gff.attributes:
                    is_hypo = ("hypothetical protein" in
                               gff.attributes["product"])
                    # Hypothetical proteins only count as overlap when
                    # they are not being ignored.
                    if not (ignore_hypo and is_hypo):
                        overlap = True
        if not overlap:
            id_ = "CRISPR_" + str(indexs["best"])
            attribute = ";".join(["ID=" + strain + "_" + id_,
                                  "method=CRT"])
            bh.write("\t".join([
                strain, "ANNOgesic", "CRISPR", str(start), str(end),
                ".", ".", ".", attribute
            ]) + "\n")
            indexs["best"] += 1
        return overlap, id_

    def _print_repeat(self, row, strain, file_h, indexs, id_, best):
        '''Print the repeat units

        ``row`` is one row of the CRT output.  Rows that do not start
        with "-", "Repeats:", "CRISPR" or "POSITION" are written to
        ``file_h`` as a repeat_unit whose parent is ``id_``.  A
        "Repeats:" row sets ``indexs["run"]`` to False.

        Returns the (possibly incremented) repeat counter, taken from
        ``indexs["re_best"]`` when ``best`` is set and from
        ``indexs["re_all"]`` otherwise.
        '''
        num = indexs["re_best"] if best else indexs["re_all"]
        if not row[0].startswith(("-", "Repeats:", "CRISPR", "POSITION")):
            start = row[0].strip()
            # The end is derived from the length of the third column.
            end = str(int(start) + len(row[2].strip()) - 1)
            attribute = ";".join([
                "ID=" + strain + "_Repeat_" + str(num), "method=CRT",
                "Parent=" + id_
            ])
            file_h.write("\t".join([
                strain, "ANNOgesic", "repeat_unit", start, end,
                ".", ".", ".", attribute
            ]) + "\n")
            num += 1
        if row[0].startswith("Repeats:"):
            indexs["run"] = False
        return num

    def _convert_gff(self, ignore_hypo):
        '''Convert the final CRT output to gff format

        Every CRT output file yields two gff files: one with all
        candidates and one with the candidates that do not overlap the
        genome annotation.
        '''
        for txt in os.listdir(self.data_folder):
            gffs = self._read_gff(txt)
            gff_name = txt.replace(".txt", "_CRISPR.gff")
            with open(os.path.join(self.data_folder, txt), "r") as fh, \
                    open(os.path.join(self.all_out, gff_name), "w") as oh, \
                    open(os.path.join(self.best_out, gff_name), "w") as bh:
                indexs = {
                    "all": 0, "re_all": 0,
                    "best": 0, "re_best": 0,
                    "run": False
                }
                for row in csv.reader(fh, delimiter='\t'):
                    if len(row) == 0:
                        continue
                    if row[0].startswith("ORGANISM:"):
                        strain = row[0].split(" ")[-1]
                    elif row[0].startswith("CRISPR"):
                        end = row[0].split("-")[-1].strip()
                        start = row[0].split("-")[0].split(":")[-1].strip()
                        id_ = "CRISPR_" + str(indexs["all"])
                        attribute = ";".join(
                            ["ID=" + strain + "_" + id_, "method=CRT"])
                        oh.write("\t".join([
                            strain, "ANNOgesic", "CRISPR", start, end,
                            ".", ".", ".", attribute
                        ]) + "\n")
                        overlap, over_id = self._compare_gff(
                            strain, int(start), int(end), gffs, bh, indexs,
                            ignore_hypo)
                        indexs["all"] += 1
                        indexs["run"] = True
                    # "run" stays set from a CRISPR header row until the
                    # "Repeats:" row is seen by _print_repeat.
                    if indexs["run"]:
                        indexs["re_all"] = self._print_repeat(
                            row, strain, oh, indexs, id_, False)
                        if not overlap:
                            indexs["re_best"] = self._print_repeat(
                                row, strain, bh, indexs, over_id, True)

    def _count_repeats(self, file_stats, seq_id, repeat):
        '''Record one CRISPR with ``repeat`` repeat units for ``seq_id``
        and for the "all" summary of ``file_stats``.'''
        for key in (seq_id, "all"):
            counts = file_stats[key]["re"]
            counts[repeat] = counts.get(repeat, 0) + 1

    def _stat_and_correct(self, stats, folder):
        '''do statistics and print the final gff file

        Every gff file in ``folder`` is rewritten with a version header
        and renumbered IDs.  ``stats[prefix]`` receives, per seq_id and
        for "all", the number of CRISPRs ("cri") and a histogram of
        repeat units per CRISPR ("re").
        '''
        for gff in os.listdir(folder):
            prefix = gff.replace("_CRISPR.gff", "")
            stats[prefix] = {"all": {"cri": 0, "re": {}}}
            gff_file = os.path.join(folder, gff)
            with open(gff_file, "r") as gh, open("tmp_cri.gff", "w") as oh:
                oh.write("##gff-version 3\n")
                cr_num = 0
                re_num = 0
                repeat = 0
                # seq_id of the CRISPR whose repeat units are currently
                # being counted; None until the first CRISPR is seen.
                crispr_seq = None
                for entry in Gff3Parser().entries(gh):
                    if entry.seq_id not in stats[prefix]:
                        stats[prefix][entry.seq_id] = {"cri": 0, "re": {}}
                    if entry.feature == "CRISPR":
                        # Close the previous CRISPR under its own seq_id,
                        # not under the seq_id of the entry that follows.
                        if crispr_seq is not None:
                            self._count_repeats(stats[prefix], crispr_seq,
                                                repeat)
                        id_ = "CRISPR_" + str(cr_num)
                        attribute = ";".join(
                            ["ID=" + entry.seq_id + "_" + id_,
                             "method=CRT"])
                        cr_num += 1
                        crispr_seq = entry.seq_id
                        repeat = 0
                        stats[prefix][entry.seq_id]["cri"] += 1
                        stats[prefix]["all"]["cri"] += 1
                    elif entry.feature == "repeat_unit":
                        attribute = ";".join([
                            "ID=" + entry.seq_id + "_Repeat_" + str(re_num),
                            "method=CRT", "Parent=" + id_
                        ])
                        re_num += 1
                        repeat += 1
                    oh.write("\t".join([entry.info_without_attributes,
                                        attribute]) + "\n")
                if crispr_seq is not None:
                    self._count_repeats(stats[prefix], crispr_seq, repeat)
            os.remove(gff_file)
            shutil.move("tmp_cri.gff", gff_file)

    def _print_file(self, sh, cri_res_all, cri_res_best):
        '''Write the counts of one strain (all and best) to ``sh``.'''
        sh.write("\tthe number of CRISPR - {0}\n".format(cri_res_all["cri"]))
        for index, num in cri_res_all["re"].items():
            sh.write("\t\tCRISPR with {0} repeat units - {1}\n".format(
                index, num))
        sh.write("\tthe number of CRISPR which not overlap "
                 "with genome annotation - {0}\n".format(
                     cri_res_best["cri"]))
        for index, num in cri_res_best["re"].items():
            sh.write("\t\tCRISPR with {0} repeat units - {1}\n".format(
                index, num))

    def _print_stat(self, stats):
        '''print the statistics file

        One ``<prefix>.csv`` file is written per entry of
        ``stats["all"]``.  A strain that is missing from the best set is
        reported with zero counts (and added to ``stats["best"]``).
        '''
        for prefix, strains in stats["all"].items():
            stat_file = os.path.join(self.stat_folder, prefix + ".csv")
            with open(stat_file, "w") as sh:
                # Only the "all" summary is present: nothing was found.
                if len(strains) == 1:
                    sh.write("No CRISPR can be detected")
                    continue
                if len(strains) > 2:
                    sh.write("All strains:\n")
                    self._print_file(sh, stats["all"][prefix]["all"],
                                     stats["best"][prefix]["all"])
                for strain, cri_res in strains.items():
                    if strain == "all":
                        continue
                    sh.write(strain + ":\n")
                    # Every candidate of a strain may overlap the
                    # annotation, leaving no entry in the best set.
                    best_res = stats["best"][prefix].setdefault(
                        strain, {"cri": 0, "re": {}})
                    self._print_file(sh, cri_res, best_res)

    def run_crispr(self, args_cris):
        '''detection of CRISPR'''
        self.multiparser.parser_fasta(args_cris.fastas)
        self.multiparser.parser_gff(args_cris.gffs, None)
        self._run_crt(args_cris)
        self._convert_gff(args_cris.ignore_hypo)
        print("All candidates:")
        self.multiparser.combine_gff(args_cris.gffs, self.all_out, None,
                                     "CRISPR")
        print("Best candidates:")
        self.multiparser.combine_gff(args_cris.gffs, self.best_out, None,
                                     "CRISPR")
        stats = {"all": {}, "best": {}}
        self._stat_and_correct(stats["all"], self.all_out)
        self._stat_and_correct(stats["best"], self.best_out)
        self._print_stat(stats)
        self.helper.remove_tmp_dir(args_cris.gffs)
        self.helper.remove_tmp_dir(args_cris.fastas)
class MEME(object):
    '''detection of promoter

    Extracts the upstream sequences of TSSs, groups them per TSS class and
    runs MEME and/or GLAM2 on the resulting fasta files.
    '''

    # suffixes accepted for genome fasta files
    _FASTA_SUFFIXES = (".fa", ".fna", ".fasta")

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # "all" and "all_no_orph" are bare file names; they are joined
        # with self.tmp_folder where they are used.
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        '''remove a previous result folder and return the program folder'''
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _log_and_call(self, command, log):
        '''write the command line to the log and execute it'''
        log.write(" ".join(command) + "\n")
        call(command)

    def _parallel_option(self, args_pro):
        '''return the "-p" option of MEME (empty if it is not assigned)'''
        if args_pro.para is None:
            return []
        # str() keeps the " ".join() of the log line working when the
        # number of parallels is given as an integer.
        return ["-p", str(args_pro.para)]

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        program = args_pro.program.lower()
        in_fasta = os.path.join(input_path, fasta)
        if program in ("meme", "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            command = command + self._parallel_option(args_pro)
            self._log_and_call(
                command + ["-oc", os.path.join(meme_folder, folder),
                           in_fasta], log)
        if program in ("glam2", "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            self._log_and_call(
                [args_pro.glam2_path,
                 "-O", os.path.join(glam_folder, folder),
                 "-w", str(width), "-b", str(width),
                 "-r", str(args_pro.num_motif),
                 "-n", str(args_pro.end_run), "n", in_fasta], log)

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        program = args_pro.program.lower()
        in_fasta = os.path.join(input_path, fasta)
        if program in ("meme", "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            command = command + self._parallel_option(args_pro)
            self._log_and_call(
                command + ["-oc", os.path.join(meme_folder, folder),
                           in_fasta], log)
        if program in ("glam2", "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            self._log_and_call(
                [args_pro.glam2_path,
                 "-O", os.path.join(glam_folder, folder),
                 "-a", str(min_width), "-b", str(max_width),
                 "-r", str(args_pro.num_motif),
                 "-n", str(args_pro.end_run), "n", in_fasta], log)

    def _get_fasta_file(self, fasta_path, prefix):
        '''return the name of the fasta file in fasta_path which belongs
        to prefix, or None if no file matches.

        Previously the last listed file was returned when nothing
        matched, which silently paired a TSS file with a wrong genome.
        '''
        for fasta in os.listdir(fasta_path):
            for suffix in self._FASTA_SUFFIXES:
                if fasta.endswith(suffix) and (
                        prefix == fasta.replace(suffix, "")):
                    return fasta
        return None

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''merge the fasta files of the TSS classes and move all of
        them from the temporary folder to input_path'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        for old_file in (all_type, all_no_orph):
            if os.path.exists(old_file):
                os.remove(old_file)
        # tmp_fa: all classes without orphan; tmp_all: all classes
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        for type_ in ("sec", "inter", "anti"):
            self.helper.merge_file(self.fastas[type_],
                                   self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        moves = ((self.fastas["pri"], "allgenome_primary.fa"),
                 (self.fastas["sec"], "allgenome_secondary.fa"),
                 (self.fastas["inter"], "allgenome_internal.fa"),
                 (self.fastas["anti"], "allgenome_antisense.fa"),
                 (self.fastas["orph"], "allgenome_orphan.fa"),
                 (all_type, "allgenome_all_types.fa"),
                 (all_no_orph, "allgenome_without_orphan.fa"))
        for source, suffix in moves:
            shutil.move(source, "_".join([out_prefix, suffix]))

    def _split_fasta_by_strain(self, input_path):
        '''split every "allgenome" fasta file of input_path into one
        file per strain.

        The strain is taken from the header (everything after the second
        "_"). If a fasta file contains only one strain, the strain file
        would be a copy of the "allgenome" file, so it is removed again.
        Files of input_path without "allgenome" in the name are deleted
        first.
        '''
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        for fasta in os.listdir(input_path):
            if not fasta.endswith(".fa"):
                continue
            filename = fasta.split("allgenome")
            pre_strain = ""
            num_strain = 0
            strain_file = None
            # The handle is local to one fasta file; it can neither be
            # None at closing time nor leak into the next fasta file.
            out = None
            try:
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            strain = "_".join(line.split("_")[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                if out is not None:
                                    out.close()
                                strain_file = os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]]))
                                out = open(strain_file, "a")
                                pre_strain = strain
                        # lines in front of the first usable header
                        # belong to no strain and are skipped
                        if out is not None:
                            out.write(line + "\n")
            finally:
                if out is not None:
                    out.close()
            if num_strain == 1:
                os.remove(strain_file)

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        '''run MEME/GLAM2 for all selected fasta files and widths'''
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        program = args_pro.program.lower()
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if program in ("both", "meme"):
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            if program in ("both", "glam2"):
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # "all_types" and "without_orphan" contain "_" themselves,
                # so the last two fields have to be checked as well.
                long_name = "_".join(names[-2:])
                selected = (names[-1] in input_fastas) or (
                    (long_name in ("all_types", "without_orphan")) and (
                        long_name in input_fastas))
                if not selected:
                    continue
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                        fasta, width))
                    log.write("Computing promoters of {0} - length "
                              "{1}.\n".format(fasta, width))
                    if "-" in width:
                        self._run_small_motif(input_path, out_path, filename,
                                              fasta, width, args_pro, log)
                    else:
                        self._run_normal_motif(input_path, out_path, filename,
                                               fasta, width, args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter

        Side effects: "allfasta" is appended to prefixs and
        args_pro.source is set to True.
        '''
        if args_pro.source:
            tss_names = os.listdir(self.tss_path)
        else:
            tss_names = os.listdir(os.path.join(
                args_pro.output_folder, "TSS_classes"))
        # NOTE(review): without --source the names are listed from
        # "TSS_classes" but the files are still read from self.tss_path;
        # this is kept as it was - confirm that it is intended.
        for tss in tss_names:
            if tss.endswith("_TSS.gff"):
                self.helper.merge_file(os.path.join(
                    self.tss_path, tss), self.all_tss)
        for fasta in os.listdir(args_pro.fastas):
            if fasta.endswith(self._FASTA_SUFFIXES):
                self.helper.merge_file(os.path.join(
                    args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''remove the temporary files and folders'''
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        strains = list(prefixs)
        # _combine_file already appends "allfasta" to prefixs; adding it
        # unconditionally generated the same tables twice.
        if combine and ("allfasta" not in strains):
            strains.append("allfasta")
        program = program.lower()
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            for name, sub_folder in (("meme", "MEME"), ("glam2", "GLAM2")):
                if program not in ("both", name):
                    continue
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, sub_folder)):
                    result_path = os.path.join(output_folder, strain,
                                               sub_folder, folder)
                    csv_file = os.path.join(result_path, name + ".csv")
                    gen_promoter_table(
                        os.path.join(result_path, name + ".txt"),
                        csv_file, tss_file, name)
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes"))
            print("Classifying TSSs and extracting sequence of {0}".format(
                prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        '''convert the numbers of --use_tss_typ to the names of the
        fasta files which should be used'''
        tss_types = {1: "all_types", 2: "primary", 3: "secondary",
                     4: "internal", 5: "antisense", 6: "orphan",
                     7: "without_orphan"}
        input_fastas = []
        for tss in args_pro.use_tss:
            type_ = tss_types.get(int(tss))
            if type_ is None:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
            input_fastas.append(type_)
        return input_fastas

    def run_meme(self, args_pro, log):
        '''main function: extract the upstream sequences, search the
        promoter motifs and generate the promoter tables'''
        # remove the merged files of a previous run
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
        if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
            shutil.rmtree(os.path.join(args_pro.fastas,
                                       "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(args_pro.output_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            if fasta is None:
                # stop instead of extracting sequences from a wrong genome
                print("Error: The fasta file of {0} can not "
                      "be found!".format(prefix))
                sys.exit()
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(self.out_fasta,
                                                 "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
class CircRNADetection(object):
    '''Detection of circRNA

    Aligns the reads with segemehl (or uses the given BAM files), runs
    testrealign.x to find splice sites and converts the candidates to
    tables and gff files.
    '''

    # suffixes of read files which can be aligned directly
    _READ_SUFFIXES = (".fa", ".fna", ".fasta", ".fq", ".fastq")

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _uncompress_read(self, read, suffix, tool, log):
        '''uncompress one read file with tool (bzcat or zcat) and return
        the name of the uncompressed file'''
        mod_read = read.replace(suffix, "")
        if not any(ext in mod_read for ext in self._READ_SUFFIXES):
            mod_read = mod_read + ".fa"
        print(" ".join(["Uncompressing", read]))
        log.write(" ".join([tool, read]) + "\n")
        with open(mod_read, "w") as read_out:
            call([tool, read], stdout=read_out)
        log.write("\t" + mod_read + " is generated.\n")
        return mod_read

    def _deal_zip_file(self, read_files, log):
        '''uncompress the .bz2 and .gz read files.

        Returns one dict per sample with "sample", "files" and "zips"
        (the uncompressed files which have to be removed at the end).
        '''
        tmp_reads = []
        for reads in read_files:
            zips = []
            # "files" is intentionally the same list object as
            # reads["files"]: the uncompressed names are added to the
            # input data as well, and the later steps derive the names
            # of the BAM files from them.
            files = reads["files"]
            # the list grows while it is iterated, so a newly added
            # file is inspected as well (as before)
            for read in files:
                for suffix, tool in ((".bz2", "bzcat"), (".gz", "zcat")):
                    if read.endswith(suffix):
                        mod_read = self._uncompress_read(
                            read, suffix, tool, log)
                        files.append(mod_read)
                        zips.append(mod_read)
                        break
            tmp_reads.append({"sample": reads["sample"],
                              "files": files, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        '''generate the index of the genome for segemehl'''
        command = [segemehl_path,
                   "-x", os.path.join(fasta_path, index),
                   "-d", os.path.join(fasta_path, fasta)]
        log.write(" ".join(command) + "\n")
        call(command)

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        '''start one segemehl alignment and return the running process.

        The command is written to the main log. The parameter was
        previously overwritten by the handle of the stderr file, so the
        command never reached the main log.
        '''
        sub_path = os.path.join(self.alignment_path, fasta_prefix)
        command = [args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"]
        log.write(" ".join(command) + "\n")
        # The child process inherits its own copies of the descriptors,
        # so the handles of this process can be closed right after the
        # start instead of staying open.
        with open(os.path.join(sub_path, sam_file), "w") as out, \
                open(os.path.join(sub_path, log_file), "w") as err_log:
            p = Popen(command, stdout=out, stderr=err_log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be
        skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write(
            "Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            # For other files the name of the index would be the name of
            # the file itself.
            if not fasta.endswith(".fa"):
                continue
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index,
                                           fasta, log)
            processes = []
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    read_name = read.split("/")[-1]
                    # compressed files are skipped; their uncompressed
                    # copies are part of the list
                    if not read_name.endswith(self._READ_SUFFIXES):
                        continue
                    read_prefix = ".".join(read_name.split(".")[:-1])
                    sam_file = "_".join(
                        [read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join(
                        [read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join(
                        [read_prefix, fasta_prefix]))
                    print("Mapping {0}".format(sam_file))
                    processes.append(self._run_segemehl_align(
                        args_circ, index, fasta, read,
                        sam_file, log_file, fasta_prefix, log))
                    if len(processes) == args_circ.cores:
                        self._wait_process(processes)
                        # start a new batch; finished processes were
                        # waited for again (5 seconds each) before
                        processes = []
            self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        command = [samtools_path, "view", "-bS", pre_sam, "-o", out_bam]
        log.write(" ".join(command) + "\n")
        call(command)

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        '''convert the SAM files of sub_alignment_path to BAM files.

        Returns the BAM files, the BAM files which were converted from
        SAM files that are not from this run (convert_ones) and the SAM
        files of this run (remove_ones). The .log files are deleted.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write(
            "Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path,
                                              file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        '''merge the BAM files of every sample, sort the merged file and
        convert it to a SAM file'''
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(
                out_folder, "_".join([prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                # argument list instead of a shell string: paths with
                # spaces or shell characters are passed unchanged
                command = [samtools_path, "merge",
                           sample_bam] + list(bam_data["files"])
                log.write(" ".join(command) + "\n")
                call(command)
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(
                out_folder,
                "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            command = [samtools_path, "sort", "-o", sort_sample, sample_bam]
            log.write(" ".join(command) + "\n")
            call(command)
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sam = sort_sample.replace(".bam", ".sam")
            command = [samtools_path, "view", "-h", "-o",
                       sort_sam, sort_sample]
            log.write(" ".join(command) + "\n")
            call(command)
            log.write("Done!\n")
            log.write("\t" + sort_sam + " is generated.\n")

    def _get_read_bam(self, read, prefix, strip_zip):
        '''return the BAM file of the alignment of read to prefix'''
        read_name = read.split("/")[-1]
        if strip_zip and (read.endswith(".gz") or read.endswith(".bz2")):
            read_name = ".".join(read_name.split(".")[:-1])
        read_prefix = ".".join(read_name.split(".")[:-1])
        return os.path.join(self.alignment_path, prefix,
                            "_".join([read_prefix, prefix + ".bam"]))

    def _merge_sort_aligment_file(self, bam_datas, read_datas,
                                  samtools_path, out_folder, convert_ones,
                                  tmp_reads, remove_ones, prefix, log):
        '''collect the BAM files of every sample (given ones and the
        ones of the read alignment), merge and sort them and remove the
        intermediate files'''
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    bam = self._get_read_bam(read, prefix, True)
                    # A compressed read file and its uncompressed copy
                    # point to the same BAM file; it must be merged only
                    # once.
                    if bam not in bam_files:
                        bam_files.append(bam)
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            bam = self._get_read_bam(read, prefix, False)
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''run testrealign.x for all sorted SAM files of out_folder'''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write(
            "Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write(
            "Please make sure your testrealign.x exists. If it does not "
            "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = [
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix + "_transrealigned.bed")]
                log.write(" ".join(command) + " 2>" + err_log + "\n")
                # same result as the redirection " 2>" of the shell, but
                # without building a shell string from the paths
                with open(err_log, "w") as err_fh:
                    call(command, stderr=err_fh)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis

        Returns the samples of the last processed fasta file and the
        prefixes of all fasta files.
        '''
        fa_prefixs = []
        # defined here as well, so that a folder without fasta files
        # returns an empty list instead of raising an error
        samples = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            headers = []
            with open(os.path.join(fastas, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        headers.append(line[1:])
            fasta_prefix = ".".join(fasta.split(".")[:-1])
            fa_prefixs.append(fasta_prefix)
            bed_folder = os.path.join(output_folder, fasta_prefix)
            self.helper.check_make_folder(bed_folder)
            samples = []
            for header in headers:
                for splice in os.listdir(os.path.join(splice_path, header)):
                    if not splice.endswith(".bed"):
                        continue
                    if self.splices["file"] in splice:
                        sample = splice.replace(header, "")
                        sample = sample.replace(self.splices["file"], "")
                        if sample not in samples:
                            samples.append(sample)
                    shutil.copyfile(
                        os.path.join(splice_path, header, splice),
                        os.path.join(bed_folder, "tmp_" + splice))
            for sample in samples:
                out_splice = os.path.join(
                    bed_folder,
                    "".join([fasta_prefix + sample + self.splices["file"]]))
                out_trans = os.path.join(
                    bed_folder,
                    "".join([fasta_prefix + sample + self.trans["file"]]))
                if os.path.exists(out_splice):
                    os.remove(out_splice)
                if os.path.exists(out_trans):
                    os.remove(out_trans)
                for file_ in os.listdir(bed_folder):
                    if (self.splices["splice"] in file_) and (
                            sample in file_):
                        self.helper.merge_file(
                            os.path.join(bed_folder, file_), out_splice)
                    elif (self.trans["trans"] in file_) and (
                            sample in file_):
                        self.helper.merge_file(
                            os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write(
            "Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            for bed in os.listdir(os.path.join(args_circ.output_folder,
                                               prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(
                    csv_all, args_circ, gff_all, gff_best)
                for out_file in (stat_file, csv_all, csv_best,
                                 gff_all, gff_best):
                    log.write("\t" + out_file + "\n")

    def _extract_input_files(self, inputs):
        '''parse the "sample:file1,file2" strings of --bam_files or
        --read_files; exits if the format is wrong or a file does not
        exist'''
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            files = datas[-1].split(",")
            for file_ in files:
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0], "files": files})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        '''add the BAM files of the read alignment to the BAM files of
        the samples (or generate the samples from the reads if no BAM
        files are given)'''
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(
                        os.path.join(self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    prefixs.append(fasta.replace(".fa", ""))
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path,
                    align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder, convert_ones, tmp_reads,
                remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # remove the uncompressed copies of the read files
        for reads in tmp_reads:
            for read in reads["zips"]:
                os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
class OperonDetection(object):
    '''Drive operon detection, its statistics and the merged feature gff.

    The per-feature "tmp" folders are derived from the input folders
    given in args_op; the terminator folder is optional.
    '''

    def __init__(self, args_op):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_op.tsss, "tmp")
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.utr5_path = os.path.join(args_op.utr5s, "tmp")
        self.utr3_path = os.path.join(args_op.utr3s, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        # Terminators are optional: no folder means no tmp path.
        self.term_path = None
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")

    def _check_gff(self, gffs, type_):
        '''Run the attribute check on every ".gff" file inside gffs.

        type_ is accepted for the callers' benefit but is not used.
        '''
        for name in os.listdir(gffs):
            if not name.endswith(".gff"):
                continue
            self.helper.check_uni_attributes(os.path.join(gffs, name))

    def _find(self, folder, suffix, prefix):
        '''Shorthand for helper.get_correct_file without extra filters.'''
        return self.helper.get_correct_file(folder, suffix, prefix,
                                            None, None)

    def _detect_operon(self, prefixs, args_op):
        '''Call operon() once per prefix, writing operon_<prefix>.csv.'''
        for prefix in prefixs:
            out_table = os.path.join(self.table_path,
                                     "operon_" + prefix + ".csv")
            print("Detection operons of {0}".format(prefix))
            tss = self._find(self.tss_path, "_TSS.gff", prefix)
            tran = self._find(self.tran_path, "_transcript.gff", prefix)
            gff = self._find(args_op.gffs, ".gff", prefix)
            # operon() receives False (not None) when no terminators exist.
            term = False
            if self.term_path is not None:
                term = self._find(self.term_path, "_term.gff", prefix)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table)

    def _check_and_parser_gff(self, args_op):
        '''Validate all input gff folders, then split/combine them.'''
        to_check = (
            (args_op.tsss, "tss"),
            (args_op.gffs, "gff"),
            (args_op.trans, "tran"),
            (args_op.utr5s, "utr"),
            (args_op.utr3s, "utr"),
        )
        for folder, label in to_check:
            self._check_gff(folder, label)
        self.multiparser.parser_gff(args_op.gffs, None)
        to_parse = (
            (args_op.tsss, self.tss_path, "TSS"),
            (args_op.trans, self.tran_path, "transcript"),
            (args_op.utr5s, self.utr5_path, "5UTR"),
            (args_op.utr3s, self.utr3_path, "3UTR"),
        )
        for folder, tmp_path, feature in to_parse:
            self.multiparser.parser_gff(folder, feature)
            self.multiparser.combine_gff(args_op.gffs, tmp_path,
                                         None, feature)
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.multiparser.parser_gff(args_op.terms, "term")
            self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                         None, "term")

    def _stat(self, table_path, stat_folder):
        '''Run stat() on every operon_*.csv table, writing stat_<table>.'''
        for table in os.listdir(table_path):
            is_operon_table = (table.startswith("operon_") and
                               table.endswith(".csv"))
            if is_operon_table:
                stat(os.path.join(table_path, table),
                     os.path.join(stat_folder, "stat_" + table))

    def _combine_gff(self, prefixs, args_op):
        '''Call combine_gff() per prefix to write <prefix>_all_features.gff.'''
        for prefix in prefixs:
            out_file = os.path.join(args_op.output_folder, "gffs",
                                    prefix + "_all_features.gff")
            print("Combine all features of {0}".format(prefix))
            tss = self._find(self.tss_path, "_TSS.gff", prefix)
            tran = self._find(self.tran_path, "_transcript.gff", prefix)
            gff = self._find(args_op.gffs, ".gff", prefix)
            utr5 = self._find(self.utr5_path, "_5UTR.gff", prefix)
            utr3 = self._find(self.utr3_path, "_3UTR.gff", prefix)
            term = None
            if self.term_path is not None:
                term = self._find(self.term_path, "_term.gff", prefix)
            combine_gff(gff, tran, tss, utr5, utr3, term,
                        args_op.tss_fuzzy, args_op.term_fuzzy, out_file)

    def run_operon(self, args_op):
        '''Entry point: parse inputs, detect operons, then clean up.'''
        self._check_and_parser_gff(args_op)
        prefixs = [name.replace(".gff", "")
                   for name in os.listdir(args_op.gffs)
                   if name.endswith(".gff")]
        self._detect_operon(prefixs, args_op)
        if args_op.statistics:
            self._stat(self.table_path, args_op.stat_folder)
        if args_op.combine:
            self._combine_gff(prefixs, args_op)
        for folder in (args_op.gffs, args_op.utr3s, args_op.utr5s,
                       args_op.tsss, args_op.trans):
            self.helper.remove_tmp(folder)
        if args_op.terms is not None:
            self.helper.remove_tmp(args_op.terms)
class OperonDetection(object):
    '''Drive operon detection, its statistics and the merged feature gff.

    The per-feature "tmp" folders are derived from the input folders
    given in args_op; the terminator folder is optional.
    '''

    def __init__(self, args_op):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_op.tsss, "tmp")
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.utr5_path = os.path.join(args_op.utr5s, "tmp")
        self.utr3_path = os.path.join(args_op.utr3s, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        # Terminators are optional: no folder means no tmp path.
        self.term_path = None
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")

    def _check_gff(self, gffs, type_):
        '''Run the attribute check on every ".gff" file inside gffs.

        type_ is accepted for the callers' benefit but is not used.
        '''
        for name in os.listdir(gffs):
            if not name.endswith(".gff"):
                continue
            self.helper.check_uni_attributes(os.path.join(gffs, name))

    def _find(self, folder, suffix, prefix):
        '''Shorthand for helper.get_correct_file without extra filters.'''
        return self.helper.get_correct_file(folder, suffix, prefix,
                                            None, None)

    def _detect_operon(self, prefixs, args_op):
        '''Call operon() once per prefix, writing operon_<prefix>.csv.'''
        for prefix in prefixs:
            out_table = os.path.join(self.table_path,
                                     "operon_" + prefix + ".csv")
            print("Detection operons of {0}".format(prefix))
            tss = self._find(self.tss_path, "_TSS.gff", prefix)
            tran = self._find(self.tran_path, "_transcript.gff", prefix)
            gff = self._find(args_op.gffs, ".gff", prefix)
            # operon() receives False (not None) when no terminators exist.
            term = False
            if self.term_path is not None:
                term = self._find(self.term_path, "_term.gff", prefix)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table)

    def _check_and_parser_gff(self, args_op):
        '''Validate all input gff folders, then split/combine them.'''
        to_check = (
            (args_op.tsss, "tss"),
            (args_op.gffs, "gff"),
            (args_op.trans, "tran"),
            (args_op.utr5s, "utr"),
            (args_op.utr3s, "utr"),
        )
        for folder, label in to_check:
            self._check_gff(folder, label)
        self.multiparser.parser_gff(args_op.gffs, None)
        to_parse = (
            (args_op.tsss, self.tss_path, "TSS"),
            (args_op.trans, self.tran_path, "transcript"),
            (args_op.utr5s, self.utr5_path, "5UTR"),
            (args_op.utr3s, self.utr3_path, "3UTR"),
        )
        for folder, tmp_path, feature in to_parse:
            self.multiparser.parser_gff(folder, feature)
            self.multiparser.combine_gff(args_op.gffs, tmp_path,
                                         None, feature)
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.multiparser.parser_gff(args_op.terms, "term")
            self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                         None, "term")

    def _stat(self, table_path, stat_folder):
        '''Run stat() on every operon_*.csv table, writing stat_<table>.'''
        for table in os.listdir(table_path):
            is_operon_table = (table.startswith("operon_") and
                               table.endswith(".csv"))
            if is_operon_table:
                stat(os.path.join(table_path, table),
                     os.path.join(stat_folder, "stat_" + table))

    def _combine_gff(self, prefixs, args_op):
        '''Call combine_gff() per prefix to write <prefix>_all_features.gff.'''
        for prefix in prefixs:
            out_file = os.path.join(args_op.output_folder, "gffs",
                                    prefix + "_all_features.gff")
            print("Combine all features of {0}".format(prefix))
            tss = self._find(self.tss_path, "_TSS.gff", prefix)
            tran = self._find(self.tran_path, "_transcript.gff", prefix)
            gff = self._find(args_op.gffs, ".gff", prefix)
            utr5 = self._find(self.utr5_path, "_5UTR.gff", prefix)
            utr3 = self._find(self.utr3_path, "_3UTR.gff", prefix)
            term = None
            if self.term_path is not None:
                term = self._find(self.term_path, "_term.gff", prefix)
            combine_gff(gff, tran, tss, utr5, utr3, term,
                        args_op.tss_fuzzy, args_op.term_fuzzy, out_file)

    def run_operon(self, args_op):
        '''Entry point: parse inputs, detect operons, then clean up.'''
        self._check_and_parser_gff(args_op)
        prefixs = [name.replace(".gff", "")
                   for name in os.listdir(args_op.gffs)
                   if name.endswith(".gff")]
        self._detect_operon(prefixs, args_op)
        if args_op.statistics:
            self._stat(self.table_path, args_op.stat_folder)
        if args_op.combine:
            self._combine_gff(prefixs, args_op)
        for folder in (args_op.gffs, args_op.utr3s, args_op.utr5s,
                       args_op.tsss, args_op.trans):
            self.helper.remove_tmp(folder)
        if args_op.terms is not None:
            self.helper.remove_tmp(args_op.terms)
class TestMultiparser(unittest.TestCase):
    '''Check that Multiparser splits and recombines fasta/gff/wig files.

    Every test works inside two scratch folders ("ref_folder" and
    "tar_folder") that are created in setUp and deleted in tearDown.
    '''

    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        self.tar_folder = "tar_folder"
        for folder in (self.ref_folder, self.tar_folder):
            if not os.path.exists(folder):
                os.mkdir(folder)

    def tearDown(self):
        for folder in (self.ref_folder, self.tar_folder):
            if os.path.exists(folder):
                shutil.rmtree(folder)

    @staticmethod
    def _write(path, content):
        '''Create (or overwrite) path with the given text.'''
        with open(path, "w") as handle:
            handle.write(content)

    def _assert_exists(self, *parts):
        '''Assert that the path joined from parts exists.'''
        self.assertTrue(os.path.exists(os.path.join(*parts)))

    def test_combine_fasta(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        self._write(os.path.join(tmp_tar, "aaa.fa"), self.example.sub_fasta1)
        self._write(os.path.join(tmp_tar, "bbb.fa"), self.example.sub_fasta2)
        self._write(os.path.join(tmp_ref, "aaa.gff"), self.example.sub_gff1)
        self._write(os.path.join(tmp_ref, "bbb.gff"), self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self._assert_exists(tmp_tar, "test.fa")

    def test_combine_wig(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        self._write(os.path.join(tmp_ref, "aaa.fa"), self.example.sub_fasta1)
        self._write(os.path.join(tmp_ref, "bbb.fa"), self.example.sub_fasta2)
        # One wig per (strand, strain) pair, in the same order as the
        # example contents below.
        wig_names = [
            "test_forward.wig_STRAIN_aaa.wig",
            "test_forward.wig_STRAIN_bbb.wig",
            "test_reverse.wig_STRAIN_aaa.wig",
            "test_reverse.wig_STRAIN_bbb.wig",
        ]
        wig_contents = [
            self.example.sub_f_wig1, self.example.sub_f_wig2,
            self.example.sub_r_wig1, self.example.sub_r_wig2,
        ]
        for wig_name, content in zip(wig_names, wig_contents):
            self._write(os.path.join(tmp_tar, wig_name), content)
        libs = [
            "test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
            "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-",
        ]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self._assert_exists(tmp_tar, "test_forward.wig")
        self._assert_exists(tmp_tar, "test_reverse.wig")

    def test_combine_gff(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        self._write(os.path.join(tmp_ref, "aaa.fa"), self.example.sub_fasta1)
        self._write(os.path.join(tmp_ref, "bbb.fa"), self.example.sub_fasta2)
        self._write(os.path.join(tmp_tar, "aaa.gff"), self.example.sub_gff1)
        self._write(os.path.join(tmp_tar, "bbb.gff"), self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self._assert_exists(tmp_tar, "test.gff")

    def test_parser_fasta(self):
        self._write(os.path.join(self.ref_folder, "test.fa"),
                    self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        for produced in ("tmp/aaa.fa", "tmp/bbb.fa",
                         "test.fa_folder/aaa.fa", "test.fa_folder/bbb.fa"):
            self._assert_exists(self.ref_folder, produced)

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        self._write(gff_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        for produced in ("tmp/aaa.gff", "tmp/bbb.gff",
                         "test.gff_folder/aaa.gff",
                         "test.gff_folder/bbb.gff"):
            self._assert_exists(self.ref_folder, produced)
        # Second pass: the same content under a "_TSS" file name.
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        self._write(tss_file, self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        for produced in ("tmp/aaa_TSS.gff", "tmp/bbb_TSS.gff",
                         "test_TSS.gff_folder/aaa_TSS.gff",
                         "test_TSS.gff_folder/bbb_TSS.gff"):
            self._assert_exists(self.ref_folder, produced)

    def test_parser_wig(self):
        self._write(os.path.join(self.ref_folder, "test_forward.wig"),
                    self.example.wig_f_file)
        self._write(os.path.join(self.ref_folder, "test_reverse.wig"),
                    self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        expected = (
            "tmp/test_forward_STRAIN_aaa.wig",
            "tmp/test_forward_STRAIN_bbb.wig",
            "tmp/test_reverse_STRAIN_aaa.wig",
            "tmp/test_reverse_STRAIN_bbb.wig",
            "test_forward.wig_folder/test_forward_STRAIN_aaa.wig",
            "test_forward.wig_folder/test_forward_STRAIN_bbb.wig",
            "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig",
            "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig",
        )
        for produced in expected:
            self._assert_exists(self.ref_folder, produced)
class sORFDetection(object):
    '''detection of sORF

    Orchestrates the sORF pipeline: validate and split the inputs,
    extract intergenic regions, search sORFs, produce statistics,
    reorganize the tables and remove the temporary files.
    '''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        # A missing transcript folder must not crash here with a
        # TypeError: _check_necessary_files() reports it properly.
        if args_sorf.trans is not None:
            self.tran_path = os.path.join(args_sorf.trans, "tmp")
        else:
            self.tran_path = None
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        '''Run the attribute check on every ".gff" file inside gffs.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        '''Exit when required inputs are missing, else parse the gffs.

        Required: annotation gffs, transcripts and at least one of the
        TEX/fragmented wig sets; TSSs as well when utr_detect is set.
        Calls sys.exit() (raising SystemExit) after printing and logging
        the reason.
        '''
        no_wigs = (args_sorf.tex_wigs is None) and (
            args_sorf.frag_wigs is None)
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or no_wigs:
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect and (args_sorf.tsss is None):
            print("Error: TSS files are required for UTR derived"
                  " sORF detection!")
            log.write("TSS files are required for UTR derived"
                      " sORF detection!\n")
            sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        all_gff_dir = os.path.join(self.gff_output, self.all_cand)
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(
                os.path.join(self.fasta_path, prefix + ".fa"),
                srna_file,
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])),
                tss_file,
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "forward.wig"])),
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "reverse.wig"])),
                os.path.join(all_gff_dir, "_".join([prefix, "sORF"])),
                args_sorf)
            # Results are moved only when sorf_detection wrote them.
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(all_gff_dir):
                gff_all = os.path.join(all_gff_dir,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                moves = (
                    ("sORF_all.gff", gff_all),
                    ("sORF_best.gff", gff_best),
                    ("sORF_all.csv", csv_all),
                    ("sORF_best.csv", csv_best),
                )
                for raw_suffix, target in moves:
                    shutil.move(
                        os.path.join(all_gff_dir,
                                     "_".join([prefix, raw_suffix])),
                        target)
                for target in (gff_all, gff_best, csv_all, csv_best):
                    log.write("\t" + target + "\n")

    def _remove_tmp(self, args_sorf):
        '''Delete temporary gffs, tmp folders and wig scratch folders.'''
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        for folder in (args_sorf.fastas, args_sorf.gffs, args_sorf.tsss,
                       args_sorf.trans, args_sorf.srnas):
            self.helper.remove_tmp_dir(folder)
        for scratch in ("temp_wig", "merge_wigs"):
            if scratch in os.listdir(args_sorf.out_folder):
                shutil.rmtree(os.path.join(args_sorf.out_folder, scratch))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region

        Returns the list of prefixes (gff file names without ".gff").
        '''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if not gff.endswith(".gff"):
                continue
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            print("Comparing transcripts and CDSs of {0}".format(prefix))
            inter_file = os.path.join(args_sorf.out_folder,
                                      "_".join([prefix, "inter.gff"]))
            get_intergenic(
                os.path.join(args_sorf.gffs, gff),
                os.path.join(self.tran_path,
                             "_".join([prefix, "transcript.gff"])),
                inter_file, args_sorf.utr_detect, args_sorf.hypo,
                args_sorf.extend_5, args_sorf.extend_3)
            log.write("\t" + inter_file +
                      " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        '''Run reorganize_table() on every all/best candidate table.'''
        log.write("Running re_table.py for generating coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(
                    args_sorf.out_folder, "tables", type_,
                    "_".join([prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        '''Entry point of the sORF pipeline.

        Exits via sys.exit() when fuzzy_rbs is above 6 or required
        inputs are missing.
        '''
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        all_gff_dir = os.path.join(self.gff_output, self.all_cand)
        for sorf in os.listdir(all_gff_dir):
            if not sorf.endswith("_sORF.gff"):
                continue
            # Announce only files that statistics are computed for.
            print("Running statistics of {0}".format(sorf))
            stat_file = os.path.join(
                args_sorf.out_folder, "statistics",
                "_".join(["stat", sorf.replace(".gff", ".csv")]))
            stat(os.path.join(all_gff_dir, sorf),
                 os.path.join(self.gff_output, self.best, sorf),
                 stat_file, args_sorf.utr_detect)
            log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)
class MEME(object):
    '''detection of promoter

    Extracts the sequences upstream of TSSs, groups them per TSS class
    and per strain, runs MEME and/or GLAM2 on them and converts the
    motif reports to tables.
    '''

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # "all" and "all_no_orph" are bare file names; the other entries
        # are full paths inside the tmp folder.
        self.fastas = {
            "pri": os.path.join(self.tmp_folder, "primary.fa"),
            "sec": os.path.join(self.tmp_folder, "secondary.fa"),
            "inter": os.path.join(self.tmp_folder, "internal.fa"),
            "anti": os.path.join(self.tmp_folder, "antisense.fa"),
            "orph": os.path.join(self.tmp_folder, "orphan.fa"),
            "all_no_orph": "without_orphan.fa",
            "all": "all_type.fa",
            "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
            "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa"),
        }
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    @staticmethod
    def _selected(program, tool):
        '''Return True when program requests tool ("meme"/"glam2").'''
        return program.lower() in (tool, "both")

    def _gen_and_check_folder(self, out_path, folder, type_):
        '''Remove a stale result folder and return out_path/type_.'''
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _call_meme(self, width_options, input_path, out_path, folder,
                   fasta, args_pro):
        '''Run MEME with the shared options plus width_options.'''
        meme_folder = self._gen_and_check_folder(out_path, folder, "MEME")
        command = [args_pro.meme_path, "-maxsize", "1000000", "-dna",
                   "-nmotifs", str(args_pro.num_motif)]
        command += width_options
        command += ["-maxiter", "100", "-evt", str(args_pro.e_value)]
        if args_pro.para is not None:
            # str(): process arguments must be strings, like the
            # other numeric options above.
            command += ["-p", str(args_pro.para)]
        call(command + ["-oc", os.path.join(meme_folder, folder),
                        os.path.join(input_path, fasta)])

    def _call_glam2(self, width_options, input_path, out_path, folder,
                    fasta, args_pro):
        '''Run GLAM2 with the shared options plus width_options.'''
        glam_folder = self._gen_and_check_folder(out_path, folder, "GLAM2")
        call([args_pro.glam2_path,
              "-O", os.path.join(glam_folder, folder)] +
             width_options +
             ["-r", str(args_pro.num_motif),
              "-n", str(args_pro.end_run), "n",
              os.path.join(input_path, fasta)])

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        if self._selected(args_pro.program, "meme"):
            self._call_meme(["-w", str(width)], input_path, out_path,
                            folder, fasta, args_pro)
        if self._selected(args_pro.program, "glam2"):
            self._call_glam2(["-w", str(width), "-b", str(width)],
                             input_path, out_path, folder, fasta, args_pro)

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro):
        '''run MEME with range of width

        width is "<min>-<max>". With program "both", MEME and GLAM2
        are both run, as in _run_normal_motif.
        '''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]),
                           "nt"])
        if self._selected(args_pro.program, "meme"):
            self._call_meme(
                ["-minsites", "0", "-maxsites", "2",
                 "-minw", str(min_width), "-maxw", str(max_width)],
                input_path, out_path, folder, fasta, args_pro)
        # Independent "if" (not "elif"): otherwise "both" skips GLAM2.
        if self._selected(args_pro.program, "glam2"):
            self._call_glam2(["-a", str(min_width), "-b", str(max_width)],
                             input_path, out_path, folder, fasta, args_pro)

    def _get_fasta_file(self, fasta_path, prefix):
        '''Return the file name in fasta_path whose stem equals prefix.

        Recognized extensions are .fa, .fna and .fasta.
        '''
        # NOTE(review): when nothing matches, the last listed entry is
        # returned (and an empty folder raises UnboundLocalError); kept
        # as is because callers may rely on it -- confirm and tighten.
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        '''Run the attribute check on every ".gff" file inside gffs.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Build the merged class fastas and move all into input_path.'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = every class except orphan; tmp_all adds orphan too.
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        for key in ("sec", "inter", "anti"):
            self.helper.merge_file(self.fastas[key], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        moves = (
            (self.fastas["pri"], "allstrain_primary.fa"),
            (self.fastas["sec"], "allstrain_secondary.fa"),
            (self.fastas["inter"], "allstrain_internal.fa"),
            (self.fastas["anti"], "allstrain_antisense.fa"),
            (self.fastas["orph"], "allstrain_orphan.fa"),
            (all_type, "allstrain_all_types.fa"),
            (all_no_orph, "allstrain_without_orphan.fa"),
        )
        for source, suffix in moves:
            shutil.move(source, "_".join([out_prefix, suffix]))

    def _split_fasta_by_strain(self, input_path):
        '''Split every "allstrain" fasta of input_path into strain files.

        Files without "allstrain" in their name are deleted first. The
        strain is the header text after its second "_". A fasta holding
        a single strain keeps only its "allstrain" file, and an empty
        fasta produces (and deletes) nothing. Lines met before the
        first strain header of a file are skipped.
        '''
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        for fasta in os.listdir(input_path):
            if not fasta.endswith(".fa"):
                continue
            filename = fasta.split("allstrain")
            # All state is per input file, so nothing leaks between
            # fastas (no stale handle, no stale output path).
            pre_strain = ""
            num_strain = 0
            strain_file = None
            out = None
            try:
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            strain = "_".join(line.split("_")[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                if out is not None:
                                    out.close()
                                strain_file = os.path.join(
                                    input_path,
                                    "".join([filename[0], strain,
                                             filename[-1]]))
                                # Append: a strain may show up again.
                                out = open(strain_file, "a")
                                pre_strain = strain
                        if out is not None:
                            out.write(line + "\n")
            finally:
                if out is not None:
                    out.close()
            if num_strain == 1:
                # One strain only: its file duplicates the allstrain one.
                os.remove(strain_file)

    def _run_program(self, prefixs, args_pro):
        '''Run the selected tool(s) for every fasta and every width.'''
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if self._selected(args_pro.program, "meme"):
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            if self._selected(args_pro.program, "glam2"):
                self.helper.check_make_folder(
                    os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                          fasta, width))
                    if "-" in width:
                        self._run_small_motif(input_path, out_path, filename,
                                              fasta, width, args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path,
                                               filename, fasta, width,
                                               args_pro)

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter

        Appends "allfasta" to prefixs and sets args_pro.source to True.
        '''
        if args_pro.source:
            tss_names = os.listdir(self.tss_path)
        else:
            # NOTE(review): the names come from TSS_class, but the files
            # merged below are still read from self.tss_path -- confirm.
            tss_names = os.listdir(
                os.path.join(args_pro.output_folder, "TSS_class"))
        for tss in tss_names:
            if tss.endswith("_TSS.gff"):
                self.helper.merge_file(os.path.join(self.tss_path, tss),
                                       self.all_tss)
        for fasta in os.listdir(args_pro.fastas):
            if fasta.endswith((".fa", ".fna", ".fasta")):
                self.helper.merge_file(
                    os.path.join(args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(
            os.path.join(args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(
            os.path.join(self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None,
                 None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''Delete the tmp folders and scratch data of a run.'''
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program):
        '''generate the promoter table'''
        strains = list(prefixs)
        # _combine_file() may already have appended "allfasta" to
        # prefixs; do not process it twice.
        if combine and ("allfasta" not in strains):
            strains.append("allfasta")
        tools = (("meme", "MEME", "meme.txt", "meme.csv"),
                 ("glam2", "GLAM2", "glam2.txt", "glam2.csv"))
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            for tool, sub_folder, report, table in tools:
                if not self._selected(program, tool):
                    continue
                tool_path = os.path.join(output_folder, strain, sub_folder)
                for folder in os.listdir(tool_path):
                    gen_promoter_table(
                        os.path.join(tool_path, folder, report),
                        os.path.join(tool_path, folder, table),
                        tss_file, tool)

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS

        When args_pro.source is false the TSSs are classified first,
        which needs annotation gffs, TEX wigs and input libs; the
        program exits via sys.exit() if any of them is missing.
        '''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
            return
        if (args_pro.gffs is None) or (
                args_pro.tex_wigs is None) or (
                args_pro.input_libs is None):
            print("Error: Please assign proper annotation, tex +/- "
                  "wig folder and tex treated libs!!!")
            sys.exit()
        if "TSS_class" not in os.listdir(args_pro.output_folder):
            os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
        print("Classifying TSS and extracting fasta {0}".format(prefix))
        upstream(os.path.join(self.tss_path, tss),
                 os.path.join(args_pro.fastas, fasta),
                 os.path.join(self.gff_path, prefix + ".gff"),
                 os.path.join(args_pro.output_folder, "TSS_class",
                              "_".join([prefix, "TSS.gff"])),
                 args_pro, prefix)

    def run_meme(self, args_pro):
        '''Entry point of the promoter detection.'''
        # Drop leftovers of an earlier combined ("allfasta") run.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                                          args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                                          self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("Generating the table")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program)
        self._remove_files(args_pro)
class TSSpredator(object):
    '''Generate TSSpredator config files, run TSSpredator and post-process
    the predicted TSSs / processing sites (gff conversion, filtering,
    statistics, validation and comparison with transcripts).'''

    def __init__(self, args_tss):
        '''Create the helper objects and derive the working paths from
        args_tss (out_folder, ta_files, gffs, wig_folder, fastas).'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # names of the temporary files/folders used by the methods below
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''Convert the five fields of one library entry
        (wig, tex, condition, replicate, strand) to a dict.
        The condition is stored as int, everything else unchanged.'''
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        '''Write the "<prefix>_<condition><replicate> = <wig>" lines.

        For every condition 1..lib_num the libraries of lib_list are
        written sorted by replicate; replicates of rep_set which have no
        library in that condition get an empty assignment.'''
        for num_id in range(1, lib_num + 1):
            cond_libs = sorted(
                (lib for lib in lib_list if lib["condition"] == num_id),
                key=lambda k: k['replicate'])
            reps = []
            for cond in cond_libs:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    # num_id (not the loop variable of the loop above) is
                    # used on purpose: a condition without any library
                    # would otherwise hit an unbound/stale variable.
                    out.write("{0}_{1}{2} = \n".format(prefix, num_id, rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        '''Run "java -jar <tsspredator_path> <config_file>"; stdout goes to
        log.txt and stderr to err.txt inside out_path.'''
        print("Running TSSpredator for " + prefix)
        with open(os.path.join(out_path, "log.txt"), "w") as out, \
                open(os.path.join(out_path, "err.txt"), "w") as err:
            call(["java", "-jar", tsspredator_path, config_file],
                 stdout=out, stderr=err)

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        '''Parse the library strings and write the annotation, 5' library
        and genome sections of the config file to out.

        Each entry of libs is "wig:tex|notex:condition:replicate:strand".
        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id) where
        lib_num is the highest condition number, num_id equals lib_num,
        rep_set is the set of replicate names, lib_dict groups the
        libraries by treatment/strand ("fp", "fm", "np", "nm") and
        list_num_id holds the condition numbers as strings.
        Exits the program for a non-wig library or an unknown program.'''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # replace the library name by the strain specific wig file
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            lib_num = max(lib_num, int(lib_datas[2]))
            rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            minus_libs, plus_libs = lib_dict["fm"], lib_dict["fp"]
        elif program.lower() == "processing_site":
            minus_libs, plus_libs = lib_dict["nm"], lib_dict["np"]
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        self._print_lib(lib_num, minus_libs, out, wig_folder,
                        "fivePrimeMinus", rep_set)
        self._print_lib(lib_num, plus_libs, out, wig_folder,
                        "fivePrimePlus", rep_set)
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        list_num_id = [str(num_id) for num_id in range(1, lib_num + 1)]
        # The second element used to be the leftover loop variable, which
        # is lib_num after the loops but unbound when there is no library.
        return lib_num, lib_num, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match

        args_tss.repmatch is either "all_<n>" or a comma separated list of
        "<lib>_<n>". In the second case the most frequent <n> (the first
        one on ties) becomes the global minNumRepMatches and every library
        with another value gets its own minNumRepMatches_<lib> line.'''
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
            return
        nums = {}
        matchs = {}
        for match in args_tss.repmatch.split(","):
            fields = match.split("_")
            lib, rep = fields[0], fields[-1]
            matchs[lib] = rep
            nums[rep] = nums.get(rep, 0) + 1
        # max() returns the first key with the highest count
        max_rep = max(nums, key=nums.get)
        out.write("minNumRepMatches = {0}\n".format(max_rep))
        for lib, rep in matchs.items():
            if rep != max_rep:
                out.write("minNumRepMatches_{0} = {1}\n".format(lib, rep))

    def _gen_config(self, project_strain_name, args_tss,
                    gff, wig_folder, fasta, config_file):
        '''generation of config files

        Prepares MasterTables/MasterTable_<project_strain_name> via
        helper.check_make_folder and writes the TSSpredator configuration
        to config_file.'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        with open(config_file, "w") as out:
            out.write("TSSinClusterSelectionMethod = HIGHEST\n")
            out.write("allowedCompareShift = 1\n")
            out.write("allowedRepCompareShift = 1\n")
            lib_num, _, rep_set, lib_dict, list_num_id = self._import_lib(
                args_tss.libs, wig_folder, project_strain_name,
                out, gff, args_tss.program, fasta)
            out.write("idList = ")
            out.write(",".join(list_num_id) + "\n")
            out.write("maxASutrLength = 100\n")
            out.write("maxGapLengthInGene = 500\n")
            out.write("maxNormalTo5primeFactor = {0}\n".format(
                args_tss.processing_factor))
            out.write("maxTSSinClusterDistance = {0}\n".format(
                args_tss.cluster + 1))
            out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
            out.write("min5primeToNormalFactor = {0}\n".format(
                args_tss.enrichment_factor))
            out.write("minCliffFactor = {0}\n".format(args_tss.factor))
            out.write("minCliffFactorDiscount = {0}\n".format(
                args_tss.factor_reduction))
            out.write("minCliffHeight = {0}\n".format(args_tss.height))
            out.write("minCliffHeightDiscount = {0}\n".format(
                args_tss.height_reduction))
            out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
            self._print_repmatch(args_tss, out)
            out.write("minPlateauLength = 0\n")
            out.write("mode = cond\n")
            out.write("normPercentile = 0.9\n")
            # the "normal" libraries are the counterpart of the 5' ones
            # written by _import_lib
            if args_tss.program.lower() == "tss":
                minus_libs, plus_libs = lib_dict["nm"], lib_dict["np"]
            else:
                minus_libs, plus_libs = lib_dict["fm"], lib_dict["fp"]
            self._print_lib(lib_num, minus_libs, out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, plus_libs, out, wig_folder,
                            "normalPlus", rep_set)
            out.write("numReplicates = {0}\n".format(len(rep_set)))
            out.write("numberOfDatasets = {0}\n".format(lib_num))
            out.write("outputDirectory = {0}\n".format(out_path))
            for prefix_id, out_prefix in enumerate(args_tss.output_prefixs):
                out.write("outputPrefix_{0} = {1}\n".format(
                    prefix_id + 1, out_prefix))
            out.write("projectName = {0}\n".format(project_strain_name))
            out.write("superGraphCompatibility = igb\n")
            out.write("texNormPercentile = 0.5\n")
            out.write("writeGraphs = 0\n")
            out.write("writeNocornacFiles = 0\n")

    def _convert_gff(self, prefixs, args_tss):
        '''Convert MasterTable.tsv of every prefix to
        <gff_outfolder>/<prefix>_<program>.gff. The gff file is always
        created (empty), even when the MasterTable is missing.'''
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            # create/truncate the output file before the conversion
            with open(out_file, "w"):
                pass
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
                continue
            if args_tss.program.lower() == "processing":
                feature = "processing_site"
            elif args_tss.program.lower() == "tss":
                feature = "TSS"
            self.converter.convert_mastertable2gff(
                os.path.join(out_path, "MasterTable.tsv"),
                "ANNOgesic", feature, prefix, out_file)

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # NOTE(review): when no annotation matches, gff keeps the last
            # listed file name (behaviour of the original loop).
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            # NOTE(review): when no annotation matches, gff keeps the last
            # listed file name (behaviour of the original loop).
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            # the annotation is replaced by the file validate_gff wrote
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            ta_file = None
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    ta_file = ta
                    break
            if ta_file is None:
                continue
            compare_file = os.path.join(
                self.gff_outfolder, "_".join([tss, "TSS.gff"]))
            stat_ta_tss(os.path.join(self.tmps["ta"], ta_file),
                        compare_file, stat_out, self.tmps["ta_tss"],
                        self.tmps["tss_ta"], args_tss.fuzzy)
            self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
            self.helper.sort_gff(
                self.tmps["ta_tss"],
                os.path.join(args_tss.ta_files, ta_file))
            os.remove(self.tmps["tss_ta"])
            os.remove(self.tmps["ta_tss"])

    def _stat_tss(self, tsss, feature):
        '''Run stat_tsspredator and plot_venn for every genome and move
        the generated files to its statistics folder.'''
        print("Running statistaics.....")
        for tss in tsss:
            stat_folder = os.path.join(self.stat_outfolder, tss)
            compare_file = os.path.join(
                self.gff_outfolder, "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(
                    stat_folder,
                    "_".join(["stat", feature, "class", tss]) + ".csv"),
                os.path.join(
                    stat_folder,
                    "_".join(["stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_class", ".png"])
            tss_stat = os.path.join(self.stat_outfolder, "TSSstatistics.tsv")
            if os.path.exists(tss_stat):
                shutil.move(tss_stat,
                            os.path.join(stat_folder, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), stat_folder,
                                         ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        '''Generate one config file per genome that has a fasta file, a gff
        file and at least one "<lib>_STRAIN_<genome>.wig" file.
        Returns the list of genome names a config was written for.'''
        prefixs = []
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] != gff[:-4]:
                    continue
                prefix = fasta[:-3]
                # Evaluated per genome: a flag shared by all genomes would
                # stay set after the first genome with wig files.
                detect = False
                for wig in os.listdir(self.wig_path):
                    filename = wig.split("_STRAIN_")
                    # files without the _STRAIN_ tag can not belong to
                    # any genome
                    if (len(filename) > 1) and (
                            filename[1][:-4] == prefix):
                        detect = True
                        break
                if detect:
                    prefixs.append(prefix)
                    config = os.path.join(
                        input_folder,
                        "_".join(["config", prefix]) + ".ini")
                    self._gen_config(
                        prefix, args_tss,
                        os.path.join(self.gff_path, gff), self.wig_path,
                        os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Merge the wig files of wig_folder which belong to the libraries
        and whose name contains prefix into tmp/merge_forward.wig
        ("+" libraries) and tmp/merge_reverse.wig ("-" libraries).'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        targets = (("+", os.path.join("tmp", "merge_forward.wig")),
                   ("-", os.path.join("tmp", "merge_reverse.wig")))
        for wig_file in os.listdir(wig_folder):
            wig_path = os.path.join(wig_folder, wig_file)
            for lib in libs:
                info = lib.split(":")
                for strand, merged in targets:
                    if (info[0][:-4] in wig_file) and (
                            info[-1] == strand) and (
                            prefix in wig_file) and (
                            os.path.isfile(wig_path)):
                        Helper().merge_file(wig_path, merged)

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            gff_name = "_".join([prefix, args_tss.program + ".gff"])
            tmp_tss = os.path.join(self.tmps["tmp"], gff_name)
            pre_tss = os.path.join(self.gff_outfolder, gff_name)
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''Remove the temporary folders of the input folders and the merged
        wig files in the current working directory.'''
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and
        processing site at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            return
        print("Comparing TSS and Processing site...")
        program = args_tss.program.lower()
        if program == "tss":
            own_suffix, ref_suffix = "_TSS.gff", "_processing.gff"
        elif program in ("processing", "processing_site"):
            # run_tsspredator renames "processing_site" to "processing"
            # before this method is called, so both names are accepted.
            own_suffix, ref_suffix = "_processing.gff", "_TSS.gff"
        else:
            return
        for tss in os.listdir(out_folder):
            if tss.endswith(own_suffix):
                ref = self.helper.get_correct_file(
                    args_tss.references, ref_suffix,
                    tss.replace(own_suffix, ""), None, None)
                filter_tss_pro(os.path.join(out_folder, tss), ref,
                               args_tss.overlap_feature, args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        program = args_tss.program.lower()
        for gff in os.listdir(gff_folder):
            prefix = None
            if (program == "tss") and (gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (program == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if not prefix:
                continue
            stat_file = os.path.join(
                self.stat_outfolder, prefix,
                "_".join(["stat", prefix, "low_expression_cutoff.csv"]))
            with open(stat_file, "w") as out:
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
            # replace the gff file by the filtered one
            os.remove(os.path.join(gff_folder, gff))
            shutil.move("tmp/without_low_expression.gff",
                        os.path.join(gff_folder, gff))

    def run_tsspredator(self, args_tss):
        '''Entry point: generate the configs, run TSSpredator for every
        genome, convert the results to gff and run the optional
        post-processing steps selected in args_tss.

        Note: args_tss.program is changed from "processing_site" to
        "processing" after TSSpredator has been run.'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            tss_stat = os.path.join(out_path, "TSSstatistics.tsv")
            if os.path.exists(tss_stat):
                shutil.move(tss_stat, os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        gff_suffix = "".join(["_", args_tss.program, ".gff"])
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(gff_suffix, "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class CircRNADetection(object):
    '''Detection of circRNA'''

    # extensions of read files which are aligned / recognised
    _READ_SUFFIXES = (".fa", ".fna", ".fasta", ".fq", ".fastq")
    # compression suffix -> program which writes the content to stdout
    _UNZIP_TOOLS = ((".bz2", "bzcat"), (".gz", "zcat"))

    def __init__(self, args_circ):
        '''Create the helper objects and derive the working paths from
        args_circ (output_folder, gffs, fastas).'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _uncompressed_target(self, read):
        '''Return (uncompressed file name, unzip program) for a .bz2/.gz
        read file, or None when the file is not compressed. ".fa" is
        appended when the name contains no known read extension.'''
        for suffix, tool in self._UNZIP_TOOLS:
            if read.endswith(suffix):
                mod_read = read.replace(suffix, "")
                if not any(ext in mod_read for ext in self._READ_SUFFIXES):
                    mod_read = mod_read + ".fa"
                return mod_read, tool
        return None

    def _deal_zip_file(self, read_files, log):
        '''Uncompress the .bz2/.gz read files next to the originals.

        Returns one dict per sample with "sample", "files" (the given
        files plus the uncompressed ones) and "zips" (the uncompressed
        files, which have to be removed afterwards). The lists of
        read_files are not modified.'''
        tmp_reads = []
        for reads in read_files:
            zips = []
            # copy: the caller's list must not grow while it is iterated
            tmp_datas = list(reads["files"])
            for read in reads["files"]:
                target = self._uncompressed_target(read)
                if target is None:
                    continue
                mod_read, tool = target
                tmp_datas.append(mod_read)
                zips.append(mod_read)
                print(" ".join(["Uncompressing", read]))
                log.write(" ".join([tool, read]) + "\n")
                with open(mod_read, "w") as read_out:
                    call([tool, read], stdout=read_out)
                log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        '''Build the segemehl index of one fasta file.'''
        command = [segemehl_path,
                   "-x", os.path.join(fasta_path, index),
                   "-d", os.path.join(fasta_path, fasta)]
        log.write(" ".join(command) + "\n")
        call(command)

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        '''Start segemehl for one read file and return the Popen object.

        stdout is written to sam_file and stderr to log_file inside
        <alignment_path>/<fasta_prefix>; the command line is written to
        the pipeline log.'''
        sub_path = os.path.join(self.alignment_path, fasta_prefix)
        command = [args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"]
        log.write(" ".join(command) + "\n")
        # The child process keeps its own copies of the descriptors, so
        # the handles of this process can be closed right after start.
        with open(os.path.join(sub_path, sam_file), "w") as out, \
                open(os.path.join(sub_path, log_file), "w") as err:
            p = Popen(command, stdout=out, stderr=err)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided, it can be skipped.

        Returns (align_files, prefixs): the "<read>_<genome>" names of the
        started alignments and the genome names.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write("Please make sure the version of segemehl "
                  "is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index,
                                           fasta, log)
            processes = []
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    read_name = read.split("/")[-1]
                    if not read_name.endswith(self._READ_SUFFIXES):
                        continue
                    read_prefix = ".".join(read_name.split(".")[:-1])
                    sam_file = "_".join([read_prefix,
                                         fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix,
                                         fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix,
                                                 fasta_prefix]))
                    print("Mapping {0}".format(sam_file))
                    processes.append(self._run_segemehl_align(
                        args_circ, index, fasta, read, sam_file,
                        log_file, fasta_prefix, log))
                    # only started processes count for the core limit;
                    # finished ones are dropped so that they are not
                    # waited for (and slept on) a second time
                    if len(processes) == args_circ.cores:
                        self._wait_process(processes)
                        processes = []
            self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(os.path.join(
                    self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log):
        '''Convert one SAM file to BAM with "samtools view -bS".'''
        command = [samtools_path, "view", "-bS", pre_sam, "-o", out_bam]
        log.write(" ".join(command) + "\n")
        call(command)

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        '''Convert every SAM file of sub_alignment_path to BAM and remove
        the .log files.

        Returns (bam_files, convert_ones, remove_ones): all BAM files,
        the converted BAM files which do not belong to align_files and
        the SAM files which belong to align_files.'''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools "
                  "is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) +
                          "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        '''Merge the BAM files of every sample, sort the result and convert
        it to <out_folder>/<prefix>_<sample>_sort.sam.'''
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                # argument list instead of a shell string: file names
                # with spaces or shell characters stay intact
                command = [samtools_path, "merge", sample_bam]
                command.extend(bam_data["files"])
                log.write(" ".join(command) + "\n")
                call(command)
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + "_sort.bam"]))
            command = [samtools_path, "sort", "-o", sort_sample, sample_bam]
            log.write(" ".join(command) + "\n")
            call(command)
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sam = sort_sample.replace(".bam", ".sam")
            command = [samtools_path, "view", "-h", "-o",
                       sort_sam, sort_sample]
            log.write(" ".join(command) + "\n")
            call(command)
            log.write("Done!\n")
            log.write("\t" + sort_sam + " is generated.\n")

    def _expected_bam(self, read, prefix):
        '''Return the BAM file which the alignment of read against the
        genome prefix produces (compressed reads are aligned under their
        uncompressed name).'''
        target = self._uncompressed_target(read)
        if target is not None:
            read = target[0]
        read_prefix = ".".join(read.split("/")[-1].split(".")[:-1])
        return os.path.join(self.alignment_path, prefix,
                            "_".join([read_prefix, prefix + ".bam"]))

    def _collect_merge_inputs(self, bam_datas, read_datas, prefix):
        '''Return the list of {"sample", "files"} dicts with the BAM files
        to merge per sample. bam_datas is never modified and no BAM file
        is listed twice for a sample.'''
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    bam = self._expected_bam(read, prefix)
                    # a compressed read and its uncompressed copy map to
                    # the same BAM file
                    if bam not in bam_files:
                        bam_files.append(bam)
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
            return merge_bam_datas
        merge_bam_datas = copy.deepcopy(bam_datas)
        if read_datas is not None:
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] != read_data["sample"]:
                        continue
                    for read in read_data["files"]:
                        bam = self._expected_bam(read, prefix)
                        if bam not in bam_data["files"]:
                            bam_data["files"].append(bam)
        return merge_bam_datas

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path, out_folder,
            convert_ones, tmp_reads, remove_ones, prefix, log):
        '''Merge and sort the BAM files of every sample (given BAM files
        and/or BAM files of the aligned reads) and remove the intermediate
        files listed in convert_ones and remove_ones afterwards.
        tmp_reads is not used.'''
        merge_bam_datas = self._collect_merge_inputs(
            bam_datas, read_datas, prefix)
        self._run_samtools_merge_sort(samtools_path, prefix, out_folder,
                                      merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run testrealign.x for every *sort.sam file of out_folder; the
        bed files are written to <splice_path>/<prefix> and stderr to
        <prefix>.log in that folder (overwritten by every run).'''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl "
                  "is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. "
                  "If it does not "
                  "exists, please reinstall your Segemehl "
                  "via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if not sam_file.endswith("sort.sam"):
                continue
            sample_prefix = sam_file.replace("_sort.sam", "")
            command = [
                testrealign_path,
                "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                "-q", os.path.join(out_folder, sam_file), "-n",
                "-U", os.path.join(sub_splice_path,
                                   sample_prefix + "_splicesites.bed"),
                "-T", os.path.join(sub_splice_path,
                                   sample_prefix + "_transrealigned.bed")]
            log.write(" ".join(command) + " 2>" + err_log + "\n")
            # argument list + stderr redirection instead of a shell string
            with open(err_log, "w") as err_fh:
                call(command, stderr=err_fh)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis

        Returns (samples, fa_prefixs): the sample tags found for the last
        processed fasta file (empty list if there is no fasta file) and
        the names of all processed fasta files without extension.'''
        fa_prefixs = []
        # defined up front so that the return value exists even when no
        # fasta file is found
        samples = []
        for fasta in os.listdir(fastas):
            if not fasta.endswith((".fa", ".fna", ".fasta")):
                continue
            headers = []
            with open(os.path.join(fastas, fasta), "r") as f_h:
                for line in f_h:
                    line = line.strip()
                    if line.startswith(">"):
                        headers.append(line[1:])
            fasta_prefix = ".".join(fasta.split(".")[:-1])
            fa_prefixs.append(fasta_prefix)
            bed_folder = os.path.join(output_folder, fasta_prefix)
            self.helper.check_make_folder(bed_folder)
            samples = []
            for header in headers:
                header_folder = os.path.join(splice_path, header)
                for splice in os.listdir(header_folder):
                    if not splice.endswith(".bed"):
                        continue
                    if self.splices["file"] in splice:
                        sample = splice.replace(header, "")
                        sample = sample.replace(self.splices["file"], "")
                        if sample not in samples:
                            samples.append(sample)
                    shutil.copyfile(
                        os.path.join(header_folder, splice),
                        os.path.join(bed_folder, "tmp_" + splice))
            for sample in samples:
                out_splice = os.path.join(
                    bed_folder,
                    fasta_prefix + sample + self.splices["file"])
                out_trans = os.path.join(
                    bed_folder,
                    fasta_prefix + sample + self.trans["file"])
                if os.path.exists(out_splice):
                    os.remove(out_splice)
                if os.path.exists(out_trans):
                    os.remove(out_trans)
                for file_ in os.listdir(bed_folder):
                    if (self.splices["splice"] in file_) and (
                            sample in file_):
                        self.helper.merge_file(
                            os.path.join(bed_folder, file_), out_splice)
                    elif (self.trans["trans"] in file_) and (
                            sample in file_):
                        self.helper.merge_file(
                            os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write("Running circRNA.py to do statistics and generate "
                  "gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(
                os.path.join(self.gff_folder, prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            bed_folder = os.path.join(args_circ.output_folder, prefix)
            for bed in os.listdir(bed_folder):
                # the "tmp_" copies are intermediate files
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(os.path.join(bed_folder, bed),
                                os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(
                    splice_file,
                    os.path.join(self.gff_path, prefix + ".gff"),
                    csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(
                    csv_all, args_circ, gff_all, gff_best)
                for generated in (stat_file, csv_all, csv_best,
                                  gff_all, gff_best):
                    log.write("\t" + generated + "\n")

    def _extract_input_files(self, inputs):
        '''Parse "sample:file1,file2,..." strings to a list of
        {"sample", "files"} dicts. Exits the program when an entry does
        not consist of exactly two ":" separated parts or when one of
        the files does not exist.'''
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            files = datas[-1].split(",")
            for file_ in files:
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0], "files": files})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        '''Combine the given BAM files with the BAM files of the reads.

        With bam_datas, the BAM file of every read of the same sample is
        appended (in place) when it is part of bam_files; without
        bam_datas a new list is built from read_datas only.
        Returns the list of {"sample", "files"} dicts.'''
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] != read_data["sample"]:
                        continue
                    for read in read_data["files"]:
                        prefix = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                        bam = os.path.join(self.alignment_path,
                                           prefix + ".bam")
                        if (bam in bam_files) and (
                                bam not in bam_data["files"]):
                            bam_data["files"].append(bam)
            return bam_datas
        bam_datas = []
        for read_data in read_datas:
            sample_bams = []
            for read in read_data["files"]:
                prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                sample_bams.append(os.path.join(self.alignment_path,
                                                prefix + ".bam"))
            bam_datas.append({"sample": read_data["sample"],
                              "files": sample_bams})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        '''Remove the temporary folders, the BAM files of the output folder
        and the per-genome bed folders.'''
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
            prefixs = [fasta.replace(".fa", "")
                       for fasta in os.listdir(self.fasta_path)
                       if fasta.endswith(".fa")]
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                _, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path,
                    align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder, convert_ones, tmp_reads,
                remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # remove the uncompressed copies of the compressed read files
        for reads in tmp_reads:
            for read in reads["zips"]:
                os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)