def setUp(self):
    """Build a Converter wired to mock collaborators plus example fixtures.

    Runs before each test: replaces the real gff3 parser, title printer,
    TSSpredator reader and file reader with mocks so no real I/O happens,
    copies the fixture strings from Example() onto the test case, and
    makes sure the scratch folder exists.
    """
    self.converter = Converter()
    self.example = Example()
    # Swap the Converter's collaborators for mocks (no real parsing/IO).
    self.converter.gff3parser = Mock_gff3_parser
    self.converter._print_rntptt_title = Mock_func().print_rntptt_title
    self.converter.tsspredator = Mock_TSSPredatorReader()
    self.converter._read_file = Mock_func().mock_read_file
    # Fixture payloads used by the individual tests.
    self.gff_file = self.example.gff_file
    self.ptt_out = self.example.ptt_out
    self.rnt_out = self.example.rnt_out
    self.srna_out = self.example.srna_out
    self.embl_file = self.example.embl_file
    self.embl_out = self.example.embl_out
    self.multi_embl = self.example.multi_embl
    self.gff_out = self.example.gff_out
    self.mastertable = self.example.mastertable
    self.tss_file = self.example.tss_file
    self.fasta_file = self.example.fasta_file
    self.transterm = self.example.transterm
    self.term_file = self.example.term_file
    self.circ_file = self.example.circrna_table
    self.circ_all = self.example.circrna_all
    self.circ_best = self.example.circrna_best
    # Scratch directory for files written during a test.
    self.test_folder = "test_folder"
    self.mock_args = MockClass()
    if (not os.path.exists(self.test_folder)):
        os.mkdir(self.test_folder)
def __init__(self, args_circ):
    """Set up working paths and fixed file names for circRNA detection.

    Parameters:
        args_circ: parsed argument container; must provide output_folder,
            gffs, fastas, and the align flag.

    Exits with an error message when alignment is requested but no genome
    fasta folder was assigned.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.alignment_path = os.path.join(args_circ.output_folder,
                                       "segemehl_align")
    self.splice_path = os.path.join(args_circ.output_folder,
                                    "segemehl_splice")
    self.candidate_path = os.path.join(args_circ.output_folder,
                                       "circRNA_tables")
    self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    # Fixed names of the segemehl splice-site / realignment bed files.
    self.splices = {"all_file": "splicesites_all.bed",
                    "file": "splicesites.bed",
                    "all": "splicesites_all", "splice": "splicesites"}
    self.trans = {"all_file": "transrealigned_all.bed",
                  "file": "transrealigned.bed",
                  "all": "transrealigned_all", "trans": "transrealigned"}
    self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
    # Fix: the original if/else assigned the identical fasta_path in both
    # branches; only the missing-genome check depends on the align flag.
    if args_circ.align and (args_circ.fastas is None):
        print("Error: There is no genome fasta file!!!")
        sys.exit()
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_term):
    """Set up folder layout and temp-file names for terminator detection.

    Builds the gff/table output trees (all / expressed / best /
    non-expressed candidates), records TransTermHP temp locations, and
    finally creates the output folders via _make_gff_folder().
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    # "tmp" sub-folders are produced by the multiparser for split genomes.
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                      "csv": os.path.join(args_term.out_folder, "tables")}
    # Parallel gff/csv trees, one sub-folder per candidate category.
    self.terms = {"all": os.path.join(self.outfolder["term"],
                                      "all_candidates"),
                  "express": os.path.join(self.outfolder["term"],
                                          "expressed_candidates"),
                  "best": os.path.join(self.outfolder["term"],
                                       "best_candidates"),
                  "non": os.path.join(self.outfolder["term"],
                                      "non_expressed_candidates")}
    self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                     "all_candidates"),
                 "express": os.path.join(self.outfolder["csv"],
                                         "expressed_candidates"),
                 "best": os.path.join(self.outfolder["csv"],
                                      "best_candidates"),
                 "non": os.path.join(self.outfolder["csv"],
                                     "non_expressed_candidates")}
    self.combine_path = os.path.join(self.gff_path, "combine")
    # Scratch paths; several are rooted in the current working directory.
    self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                 "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                 "hp_path": "tmp_transterm/tmp",
                 "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                 "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                 "gff": "tmp.gff",
                 "folder": os.path.join(os.getcwd(), "tmp")}
    self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                    "allgff": "term_all.gff"}
    if args_term.srnas:
        self.srna_path = os.path.join(args_term.srnas, "tmp")
    else:
        self.srna_path = None
    # Side effect: creates the candidate output folders on disk.
    self._make_gff_folder()
def __init__(self, args_circ):
    """Record segemehl alignment/splice paths and fixed bed file names."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out_dir = args_circ.output_folder
    self.alignment_path = os.path.join(out_dir, "segemehl_alignment_files")
    self.splice_path = os.path.join(out_dir, "segemehl_splice_results")
    self.candidate_path = os.path.join(out_dir, "circRNA_tables")
    self.gff_folder = os.path.join(out_dir, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    # Canonical names of the segemehl output bed files.
    self.splices = dict(file="splicesites.bed", splice="splicesites")
    self.trans = dict(file="transrealigned.bed", trans="transrealigned")
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_circ):
    """Set up working paths and fixed file names for circRNA detection.

    Parameters:
        args_circ: parsed argument container; must provide output_folder,
            gffs, fastas, and the align flag.

    Exits with an error message when alignment is requested but no genome
    fasta folder was assigned.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.alignment_path = os.path.join(args_circ.output_folder,
                                       "segemehl_align")
    self.splice_path = os.path.join(args_circ.output_folder,
                                    "segemehl_splice")
    self.candidate_path = os.path.join(args_circ.output_folder,
                                       "circRNA_tables")
    self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    self.splices = {"all_file": "splicesites_all.bed",
                    "file": "splicesites.bed",
                    "all": "splicesites_all", "splice": "splicesites"}
    self.trans = {"all_file": "transrealigned_all.bed",
                  "file": "transrealigned.bed",
                  "all": "transrealigned_all", "trans": "transrealigned"}
    self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
    # Fix: the original if/else assigned the identical fasta_path in both
    # branches; only the missing-genome check depends on the align flag.
    if args_circ.align and (args_circ.fastas is None):
        print("Error: There is no genome fasta file!!!")
        sys.exit()
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_ratt):
    """Collect the working paths used during a RATT annotation transfer."""
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()
    # Scratch area for GenBank records split out of the reference embls.
    self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(args_ratt.ref_embls, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
    gff_out = args_ratt.gff_outfolder
    self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                      "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                      "out_gff": os.path.join(gff_out, "tmp")}
    # One merged tmp.<ext> per annotation format in the gff output folder.
    for ext in ("gff", "ptt", "rnt"):
        self.tmp_files[ext] = os.path.join(gff_out, "tmp." + ext)
def __init__(self, args_tss):
    """Set up the working paths for TSS prediction.

    Parameters:
        args_tss: parsed argument container; provides out_folder, gffs,
            wig_folder, fastas, and optional ta_files / manual folders.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.master = os.path.join(args_tss.out_folder, "MasterTables")
    self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                 "tss_ta": "tmp_tss", "tmp": "tmp"}
    if args_tss.ta_files is not None:
        self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
    else:
        self.tmps["ta"] = None
    self.gff_path = os.path.join(args_tss.gffs, "tmp")
    if args_tss.manual is not None:
        self.manual_path = os.path.join(args_tss.manual, "tmp")
    else:
        # Fix: previously left unset when no manual folder was given, so
        # any later read of self.manual_path raised AttributeError.
        self.manual_path = None
    self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
    self.fasta_path = os.path.join(args_tss.fastas, "tmp")
    self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
    self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
def __init__(self, out_folder):
    """Prepare result/figure folders and temp-file names under out_folder."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gffparser = Gff3Parser()
    join = os.path.join
    self.tmp_id = join(out_folder, "tmp_id_list")
    self.all_result = join(out_folder, "all_results")
    self.best_result = join(out_folder, "best_results")
    self.fig = join(out_folder, "figures")
    self.with_strain = "with_strain"
    self.without_strain = "without_strain"
    # Plain names first, then entries anchored inside out_folder.
    self.tmp_files = {"log": "tmp_log",
                      "action": "tmp_action.log",
                      "pubmed": "tmp_pubmed.log"}
    for key, leaf in (("specific", "tmp_specific"),
                      ("nospecific", "tmp_nospecific"),
                      ("wget_action", "tmp_action")):
        self.tmp_files[key] = join(out_folder, leaf)
def convert2gff(out_path, gff_files, args_ops):
    """Convert each per-core TSSpredator MasterTable.tsv into a gff file.

    One MasterTable_<core> folder is expected under out_path per core;
    each resulting gff path is appended to *gff_files* (mutated in place).
    """
    for core_id in range(1, args_ops.cores + 1):
        table_dir = os.path.join(
            out_path, "_".join(["MasterTable", str(core_id)]))
        out_gff = os.path.join(
            table_dir, "_".join(["TSSpredator", str(core_id) + ".gff"]))
        master_table = os.path.join(table_dir, "MasterTable.tsv")
        Converter().convert_mastertable2gff(
            master_table, "TSSpredator", args_ops.program,
            args_ops.project_strain, out_gff)
        gff_files.append(out_gff)
def __init__(self, args_tss):
    """Set up the working paths for TSS prediction.

    Parameters:
        args_tss: parsed argument container; provides out_folder, gffs,
            wig_folder, fastas and optional ta_files.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.master = os.path.join(args_tss.out_folder, "MasterTables")
    self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                 "tss_ta": "tmp_tss", "tmp": "tmp"}
    # Transcript files are optional; keep an explicit None placeholder.
    if args_tss.ta_files is not None:
        self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
    else:
        self.tmps["ta"] = None
    self.gff_path = os.path.join(args_tss.gffs, "tmp")
    self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
    self.fasta_path = os.path.join(args_tss.fastas, "tmp")
    self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
    self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
def get_input(self):
    """Download required files from website.

    Chooses the target or reference folders depending on --for_target,
    creates them, then fetches each requested file type (gff, fasta,
    gbk/gbff, ptt, rnt) via get_file(). Optionally converts downloaded
    GenBank files to embl format at the end.

    Exits when no FTP path was assigned.
    """
    print("Running get input files...")
    if self._args.FTP_path is None:
        print("Error: Please assign the path for downloading the data!!")
        sys.exit()
    # Pick the folder pair for the query (target) or reference genome.
    if self._args.for_target:
        annotation_folder = self._paths.tar_annotation_folder
        fasta_folder = self._paths.tar_fasta_folder
    else:
        annotation_folder = self._paths.ref_annotation_folder
        fasta_folder = self._paths.ref_fasta_folder
    self.helper.check_make_folder(annotation_folder)
    self.helper.check_make_folder(fasta_folder)
    # Each type is requested twice: plain suffix and the NCBI
    # "_genomic.<ext>.gz" naming scheme.
    if self._args.ref_gff is True:
        get_file(self._args.FTP_path, annotation_folder, "gff",
                 self._args.for_target)
        get_file(self._args.FTP_path, annotation_folder, "_genomic.gff.gz",
                 self._args.for_target)
    if self._args.ref_fasta is True:
        get_file(self._args.FTP_path, fasta_folder, "fna",
                 self._args.for_target)
        get_file(self._args.FTP_path, fasta_folder, "_genomic.fna.gz",
                 self._args.for_target)
    if self._args.ref_gbk is True:
        get_file(self._args.FTP_path, annotation_folder, "gbk",
                 self._args.for_target)
        get_file(self._args.FTP_path, annotation_folder, "gbff",
                 self._args.for_target)
        get_file(self._args.FTP_path, annotation_folder, "_genomic.gbff.gz",
                 self._args.for_target)
    if self._args.ref_ptt is True:
        get_file(self._args.FTP_path, annotation_folder, "ptt",
                 self._args.for_target)
    if self._args.ref_rnt is True:
        get_file(self._args.FTP_path, annotation_folder, "rnt",
                 self._args.for_target)
    if self._args.convert_embl is True:
        annotation_files = os.listdir(annotation_folder)
        if len(annotation_files) == 0:
            sys.stdout.write("No gbk files!!\n")
        else:
            Converter().convert_gbk2embl(annotation_folder)
def __init__(self, args_tran):
    """Set up output folders and temp-file names for transcript detection."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
    self.tran_path = os.path.join(self.gff_outfolder, "tmp")
    self.stat_path = os.path.join(args_tran.out_folder, "statistics")
    # Intermediate files used while comparing transcripts with TSS/gff data.
    self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                 "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                 "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
                 "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
                 "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
                 "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
                 "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                 "overlap": os.path.join(self.gff_outfolder, "tmp_overlap")}
    # Output gff suffixes for fragmented / tex-notex / merged transcripts.
    self.frag = "transcript_fragment.gff"
    self.tex = "transcript_tex_notex.gff"
    self.endfix_tran = "transcript.gff"
def __init__(self, args_ratt):
    """Collect the working paths used during a RATT annotation transfer."""
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()
    # Scratch area for GenBank records split out of the reference embls.
    self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(args_ratt.ref_embls, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
    # Per-format merged tmp files plus split fasta folders.
    self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                      "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                      "out_gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp"),
                      "gff": os.path.join(args_ratt.gff_outfolder,
                                          "tmp.gff"),
                      "ptt": os.path.join(args_ratt.gff_outfolder,
                                          "tmp.ptt"),
                      "rnt": os.path.join(args_ratt.gff_outfolder,
                                          "tmp.rnt")}
def __init__(self, out_folder):
    """Prepare result/figure folders and temp-file names under out_folder."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gffparser = Gff3Parser()
    self.tmp_id = os.path.join(out_folder, "tmp_id_list")
    self.all_result = os.path.join(out_folder, "all_results")
    self.best_result = os.path.join(out_folder, "best_results")
    self.fig = os.path.join(out_folder, "figures")
    # Sub-folder names used for results with/without strain information.
    self.with_strain = "with_strain"
    self.without_strain = "without_strain"
    self.tmp_files = {"log": "tmp_log",
                      "action": "tmp_action.log",
                      "pubmed": "tmp_pubmed.log",
                      "specific": os.path.join(out_folder, "tmp_specific"),
                      "nospecific": os.path.join(out_folder,
                                                 "tmp_nospecific"),
                      "wget_action": os.path.join(out_folder, "tmp_action")}
def __init__(self, args_term):
    """Set up folder layout and temp-file names for terminator detection.

    Builds the gff/table output trees (all / expressed / best /
    non-expressed candidates), records TransTermHP temp locations, and
    finally creates the output folders via _make_gff_folder().
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                      "csv": os.path.join(args_term.out_folder, "tables")}
    # Parallel gff/csv trees, one sub-folder per candidate category.
    self.terms = {"all": os.path.join(self.outfolder["term"],
                                      "all_candidates"),
                  "express": os.path.join(self.outfolder["term"],
                                          "expressed_candidates"),
                  "best": os.path.join(self.outfolder["term"],
                                       "best_candidates"),
                  "non": os.path.join(self.outfolder["term"],
                                      "non_expressed_candidates")}
    self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                     "all_candidates"),
                 "express": os.path.join(self.outfolder["csv"],
                                         "expressed_candidates"),
                 "best": os.path.join(self.outfolder["csv"],
                                      "best_candidates"),
                 "non": os.path.join(self.outfolder["csv"],
                                     "non_expressed_candidates")}
    self.combine_path = os.path.join(self.gff_path, "combine")
    # Scratch paths; several are rooted in the current working directory.
    self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                 "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                 "hp_path": "tmp_transterm/tmp",
                 "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                 "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                 "gff": "tmp.gff",
                 "folder": os.path.join(os.getcwd(), "tmp")}
    self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                    "allgff": "term_all.gff"}
    if args_term.srnas:
        self.srna_path = os.path.join(args_term.srnas, "tmp")
    else:
        self.srna_path = None
    # Side effect: creates the candidate output folders on disk.
    self._make_gff_folder()
class RATT(object):
    '''Annotation transfer via the external RATT tool.

    Runs RATT on reference/target fasta pairs, converts the resulting
    embl annotations back to gff (and optionally ptt/rnt), and merges
    the per-sequence results into one file per genome.
    '''

    def __init__(self, args_ratt):
        """Record working paths; which embl source is used depends on
        whether ref_gbk or ref_embls was given."""
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        # ref_embls (when given) overrides the gbk-derived embl folder.
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Generate ptt/rnt files next to each transferred gff when the
        matching target fasta can be located."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Clear intermediate gff/ptt/rnt output and all tmp folders."""
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT embl result to gff, fix its format, and copy
        it into the gff output folder (filename appended to *files*)."""
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record GenBank files into one <accession>.gbk per
        record under self.gbk, returning that folder."""
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        # NOTE(review): `out` is only bound on a LOCUS
                        # line; a file not starting with LOCUS would hit
                        # `if out:` below unbound — confirm inputs always
                        # start with LOCUS.
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                # LOCUS name becomes the provisional
                                # output file name.
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        # NOTE(review): str.find returns an offset, not a
                        # boolean — this is truthy whenever the substring
                        # is absent (-1) or found past index 0; verify the
                        # intended "prefer VERSION name" behavior.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        # End of one GenBank record: close and move the
                        # buffer to its final per-record file name.
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            # Split records first, then convert each to embl and collect
            # the .embl files into self.embl.
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Invoke the external RATT executable for one ref/target pair;
        stdout goes to the log handle, stderr is discarded."""
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every ref:target pair and tidy the files RATT
        drops into the current working directory."""
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out)
            # RATT writes into the CWD: keep "final" results, delete the
            # rest of its scratch files/folders.
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path,
                                             filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or ("Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
            out.close()

    def annotation_transfer(self, args_ratt):
        """Top-level driver: prepare fastas, run RATT, convert results,
        merge per-sequence outputs per genome, and clean up."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # No embls supplied: derive them from the reference GenBank.
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            # Merge the per-sequence gff/ptt/rnt of each split genome
            # folder back into a single file per genome prefix.
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = ".".join(datas[0].split(".")[:-1])
                    for file_ in os.listdir(os.path.join(
                            args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff),
                                    self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff),
                                    self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff),
                                    self.tmp_files["rnt"])
                    if os.path.exists(self.tmp_files["gff"]):
                        shutil.move(self.tmp_files["gff"], os.path.join(
                            self.tmp_files["out_gff"], prefix + ".gff"))
                        shutil.move(self.tmp_files["ptt"], os.path.join(
                            self.tmp_files["out_gff"], prefix + ".ptt"))
                        shutil.move(self.tmp_files["rnt"], os.path.join(
                            self.tmp_files["out_gff"], prefix + ".rnt"))
                    else:
                        print("Error: Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).")
        self._remove_files(args_ratt, out_gbk)
class Terminator(object):
    '''Detection of rho-independent terminators.

    Combines TransTermHP predictions with a coverage-based search in
    gene-converged regions (RNAfold secondary structures + poly-T
    scoring), then classifies candidates as all / expressed / best /
    non-expressed.
    '''

    def __init__(self, args_term):
        """Set up folder layout and temp-file names; creates the
        candidate output folders via _make_gff_folder()."""
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder,
                                               "gffs"),
                          "csv": os.path.join(args_term.out_folder,
                                              "tables")}
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        self.tmps = {"transterm": os.path.join(os.getcwd(),
                                               "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate the data sections (everything after the 'Location'
        header line) of several ptt/rnt files into combine_file."""
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                # Ensure the last copied line ends with a newline.
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        """Create the gff/csv output folder for every candidate class."""
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Produce ptt/rnt files (optionally including sRNAs) for each
        genome gff; returns ({prefix: "srna"|"normal"}, [prefixes])."""
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file,
                            None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(gff_file, fasta,
                                                      ptt_file, rnt_file,
                                                      None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Merge each genome's ptt + rnt (+ sRNA rnt) into one combined
        .ptt file for TransTermHP."""
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke the external TransTermHP binary for one genome."""
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join(
                  [prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP for every combined .ptt genome file; removes
        the combine folder afterwards."""
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(self.fasta_path,
                                                     ".fa", prefix,
                                                     None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(
                    out_path, "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert TransTermHP .bag outputs to gff and combine them per
        genome annotation."""
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs,
                                     self.tmps["transterm"], None,
                                     self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Return a folder holding all wig files, copying tex and frag
        wigs together when both libraries exist; exits if neither is
        assigned."""
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        # When sRNA annotations exist, merge them into each genome gff
        # (sorted) so downstream steps see genes and sRNAs together.
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"],
                                       self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(
                    os.path.join(gff_path, prefix + ".gff"), tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])),
                    tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort each *_term.gff, move it into the all_candidates tree
        with a gff header, and assemble the matching csv table."""
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(os.path.join(
                    self.csvs["all"],
                    "_".join([prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                                         "Strand", "Detect",
                                         "Coverage_decrease",
                                         "Coverage_detail"]) + "\n")
                out_csv.close()
                # Append the raw per-strain tables in the order strains
                # appear in the merged gff.
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                        pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        """Run RNAfold on tmp_seq, writing structures to tmp_sec; works
        inside a throwaway folder so RNAfold's ps output is discarded."""
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path, "<",
                            os.path.join("..", tmp_seq),
                            ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(self, prefixs, merge_path,
                                              wig_path, merge_wigs,
                                              args_term):
        '''the approach for searching gene converged region terminator'''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            # NOTE(review): duplicated assignment kept from the original.
            tmp_cand = tmp_cand = os.path.join(
                args_term.out_folder,
                "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path,
                                            prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index,
                               args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq,
                                  tmp_sec, prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
                print("Detecting terminators for " + prefix)
                detect_coverage(
                    tmp_cand,
                    os.path.join(merge_path, prefix + ".gff"),
                    os.path.join(self.tran_path, "_".join([
                        prefix, "transcript.gff"])),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(wig_path, "_".join([
                        prefix, "forward.wig"])),
                    os.path.join(wig_path, "_".join([
                        prefix, "reverse.wig"])),
                    os.path.join(self.tmps["hp_path"], "_".join([
                        prefix, self.tmps["hp_gff"]])), merge_wigs,
                    os.path.join(self.outfolder["term"], "_".join([
                        prefix, self.suffixs["gff"]])),
                    os.path.join(self.tmps["term_table"], "_".join([
                        prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs,
                                     self.outfolder["term"], None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete every intermediate folder/file created by the run."""
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        """Renumber terminator IDs, then split/stat candidates into the
        best / expressed / non-expressed trees."""
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # Zero-padded sequential name per terminator.
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                        entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "_".join([
                        "terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in
                        entry.attributes.items()])
                    out_tmp.write("\t".join([
                        entry.info_without_attributes,
                        entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"], os.path.join(
                    self.terms["all"],
                    "_".join([new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                                   "_".join([prefix,
                                             self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                                   "_".join([prefix,
                                             self.suffixs["csv"]])),
                      os.path.join(stat_path,
                                   "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                                   "_".join([prefix, "term"])))
            # stat_term writes csvs next to the gffs; relocate them into
            # the table tree.
            shutil.move(os.path.join(
                self.terms["best"],
                "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["best"],
                             "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(
                self.terms["express"],
                "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["express"],
                             "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(
                self.terms["non"],
                "_".join([prefix, self.suffixs["csv"]])),
                os.path.join(self.csvs["non"],
                             "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(
                self.terms["all"],
                "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        """Validate attribute uniqueness of every gff in folder."""
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path, None,
                                     "transcript")
        # The passed-in prefixs argument is discarded and rebuilt from
        # the transcript folder contents.
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta,
                              args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix +
                        ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript",
                                  prefix, type_ + ".csv"])))

    def run_terminator(self, args_term):
        """Top-level driver: validate inputs, run TransTermHP and the
        coverage-based search, compute statistics, and clean up."""
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"],
                                    self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
# TSSpredator: driver class that wraps the external TSSpredator Java tool —
# builds its .ini config files from wig/gff/fasta inputs, runs it per genome,
# converts MasterTable output to gff, and post-processes (manual merge,
# overlap filtering, validation, statistics).
# NOTE(review): this region is a whitespace-mangled paste; physical lines
# below fuse many statements. Code preserved byte-identically; only
# annotation comments inserted at token-safe positions.
#
# Next line: __init__ (paths/tmp-name setup; note self.manual_path is only
# set when args_tss.manual is not None — later code guards on the same
# condition before using it); _assign_dict (lib-spec list -> dict; the "tex"
# key actually holds the tex/notex *type* field, a slightly misleading name);
# _print_lib (writes fivePrime*/normal* wig entries per condition, padding
# missing replicates with empty values; relies on `cond` leaking from the
# preceding for-loop); and the start of _start_to_run (java -jar invocation
# via subprocess call, stdout/stderr redirected to log.txt/err.txt).
class TSSpredator(object): def __init__(self, args_tss): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.master = os.path.join(args_tss.out_folder, "MasterTables") self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta": "tmp_tss", "tmp": "tmp"} if args_tss.ta_files is not None: self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp") else: self.tmps["ta"] = None self.gff_path = os.path.join(args_tss.gffs, "tmp") if args_tss.manual is not None: self.manual_path = os.path.join(args_tss.manual, "tmp") self.wig_path = os.path.join(args_tss.wig_folder, "tmp") self.fasta_path = os.path.join(args_tss.fastas, "tmp") self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics") self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs") def _assign_dict(self, lib_datas): return {"wig": lib_datas[0], "tex": lib_datas[1], "condition": int(lib_datas[2]), "replicate": lib_datas[3], "strand": lib_datas[4]} def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set): for num_id in range(1, lib_num+1): cond_list = [] for lib in lib_list: if num_id == lib["condition"]: cond_list.append(lib) cond_sort_list = sorted(cond_list, key=lambda k: k['replicate']) reps = [] for cond in cond_sort_list: out.write("{0}_{1}{2} = {3}\n".format( prefix, cond["condition"], cond["replicate"], os.path.join(wig_folder, cond["wig"]))) reps.append(cond["replicate"]) for rep in sorted(rep_set): if rep not in reps: out.write("{0}_{1}{2} = \n".format( prefix, cond["condition"], rep)) def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log): print("Running TSSpredator for " + prefix) log.write("Make sure the version of TSSpredator is at least 1.06.\n") out = open(os.path.join(out_path, "log.txt"), "w") err = open(os.path.join(out_path, "err.txt"), "w") log.write(" ".join(["java", "-jar", tsspredator_path, config_file]) + "\n") call(["java", "-jar", tsspredator_path, config_file], stdout=out, stderr=err) 
# Continuation: end of _start_to_run (closes handles, logs generated files).
# Then _import_lib: parses "wig:type:condition:replicate:strand" lib specs,
# matches wig files by the "_STRAIN_" filename convention, buckets them into
# fp/fm/np/nm (tex/notex x strand), and writes annotation_N plus the
# fivePrime* entries depending on program ("tss" vs "ps").
# NOTE(review): the error branch at the end of this line has its message
# string split across the physical-line break (no comment may be inserted
# there) and contains the typo "assing" -> "assign" (runtime string — left
# unchanged in this comments-only pass).
out.close() err.close() log.write("Done!\n") log.write("The following files are generated in {0}:\n".format(out_path)) for file_ in os.listdir(out_path): log.write("\t" + file_ + "\n") def _import_lib(self, libs, wig_folder, project_strain_name, out, gff, program, fasta): lib_dict = {"fp": [], "fm": [], "nm": [], "np": []} lib_num = 0 rep_set = set() list_num_id = [] for lib in libs: lib_datas = lib.split(":") if not lib_datas[0].endswith(".wig"): print("Error: Wiggle files are not end with .wig!") sys.exit() for wig in os.listdir(wig_folder): filename = wig.split("_STRAIN_") if (filename[0] == lib_datas[0][:-4]) and ( filename[1][:-4] == project_strain_name): lib_datas[0] = wig if int(lib_datas[2]) > lib_num: lib_num = int(lib_datas[2]) if lib_datas[3] not in rep_set: rep_set.add(lib_datas[3]) if (lib_datas[1] == "tex") and (lib_datas[4] == "+"): lib_dict["fp"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"): lib_dict["fm"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"): lib_dict["np"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"): lib_dict["nm"].append(self._assign_dict(lib_datas)) for num_id in range(1, lib_num+1): out.write("annotation_{0} = {1}\n".format(num_id, gff)) if program.lower() == "tss": self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "fivePrimePlus", rep_set) elif program.lower() == "ps": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "fivePrimePlus", rep_set) else: print("Error: Wrong program name! 
Please assing tss " "or processing_site.") sys.exit() for num_id in range(1, lib_num+1): out.write("genome_{0} = {1}\n".format(num_id, fasta)) for num_id in range(1, lib_num+1): list_num_id.append(str(num_id)) return lib_num, num_id, rep_set, lib_dict, list_num_id def _print_repmatch(self, args_tss, out): '''check replicate match''' detect_all = False for rep in args_tss.repmatch: if "all" in rep: detect_all = True match = rep.split("_")[-1] out.write("minNumRepMatches = {0}\n".format(match)) break if not detect_all: nums = {} matchs = {} for match in args_tss.repmatch: lib = match.split("_")[0] rep = match.split("_")[-1] matchs[lib] = rep if rep not in nums.keys(): nums[rep] = 1 else: nums[rep] += 1 for rep, num in nums.items(): if num == max(nums.values()): out.write("minNumRepMatches = {0}\n".format(rep)) max_rep = rep break for lib, rep in matchs.items(): if rep != max_rep: out.write("minNumRepMatches_{0} = {1}\n".format( lib, rep)) def _gen_config(self, project_strain_name, args_tss, gff, wig_folder, fasta, config_file, log): '''generation of config files''' master_folder = "MasterTable_" + project_strain_name out_path = os.path.join(self.master, master_folder) self.helper.check_make_folder(out_path) out = open(config_file, "w") out.write("TSSinClusterSelectionMethod = HIGHEST\n") out.write("allowedCompareShift = 1\n") out.write("allowedRepCompareShift = 1\n") lib_num, num_id, rep_set, lib_dict, list_num_id = \ self._import_lib(args_tss.libs, wig_folder, project_strain_name, out, gff, args_tss.program, fasta) out.write("idList = ") out.write(",".join(list_num_id) + "\n") out.write("maxASutrLength = 100\n") out.write("maxGapLengthInGene = 500\n") out.write("maxNormalTo5primeFactor = {0}\n".format( args_tss.processing_factor)) out.write("maxTSSinClusterDistance = {0}\n".format( args_tss.cluster + 1)) out.write("maxUTRlength = {0}\n".format(args_tss.utr_length)) out.write("min5primeToNormalFactor = {0}\n".format( args_tss.enrichment_factor)) 
# Continuation of _gen_config: remaining TSSpredator .ini keys (cliff
# heights/factors, replicate matching via _print_repmatch, normal* wig
# entries — note the normal libs are the notex set for "tss" runs and the
# tex set otherwise — output prefixes, project name). Then _convert_gff
# begins: per prefix, converts MasterTable.tsv to gff via
# convert_mastertable2gff, erroring if the MasterTable is missing.
out.write("minCliffFactor = {0}\n".format(args_tss.factor)) out.write("minCliffFactorDiscount = {0}\n".format( args_tss.factor_reduction)) out.write("minCliffHeight = {0}\n".format(args_tss.height)) out.write("minCliffHeightDiscount = {0}\n".format( args_tss.height_reduction)) out.write("minNormalHeight = {0}\n".format(args_tss.base_height)) self._print_repmatch(args_tss, out) out.write("minPlateauLength = 0\n") out.write("mode = cond\n") out.write("normPercentile = 0.9\n") if args_tss.program.lower() == "tss": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "normalPlus", rep_set) else: self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "normalPlus", rep_set) out.write("numReplicates = {0}\n".format(len(rep_set))) out.write("numberOfDatasets = {0}\n".format(lib_num)) out.write("outputDirectory = {0}\n".format(out_path)) for prefix_id in range(len(args_tss.output_prefixs)): out.write("outputPrefix_{0} = {1}\n".format( prefix_id + 1, args_tss.output_prefixs[prefix_id])) out.write("projectName = {0}\n".format(project_strain_name)) out.write("superGraphCompatibility = igb\n") out.write("texNormPercentile = 0.5\n") out.write("writeGraphs = 0\n") out.write("writeNocornacFiles = 0\n") log.write("\t" + config_file + " is generated.\n") out.close() def _convert_gff(self, prefixs, args_tss, log): for prefix in prefixs: out_file = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program]) + ".gff") gff_f = open(out_file, "w") out_path = os.path.join(self.master, "_".join([ "MasterTable", prefix])) if "MasterTable.tsv" not in os.listdir(out_path): print("Error: There is not MasterTable file in {0} ".format( out_path)) print("Please check configuration file.") log.write("not MasterTable file is found in {0}\n".format( out_path)) else: if args_tss.program.lower() == "processing": feature = 
# Continuation (previous line breaks mid-assignment "feature ="): end of
# _convert_gff. Then _merge_manual: merges manually-curated TSSs with the
# predicted ones per genome (typo "classiflying" in the runtime message —
# left unchanged). Then _validate: compares TSS/PS gffs against genome
# annotations via validate_gff, and the start of _compare_ta.
# NOTE(review): _merge_manual and _validate both reuse the loop variable
# `gff` after `break`; if no filename matches, `gff` silently holds the
# last directory entry — worth confirming inputs always contain a match.
"processing_site" elif args_tss.program.lower() == "tss": feature = "TSS" self.converter.convert_mastertable2gff( os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic", feature, prefix, out_file) log.write("\t" + out_file + "is generated.\n") gff_f.close() def _merge_manual(self, tsss, args_tss): '''if manual detected TSS is provided, it can merge manual detected TSS and TSSpredator predicted TSS''' self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tss"])) for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break filename = "_".join([tss, args_tss.program]) + ".gff" predict = os.path.join(self.gff_outfolder, filename) manual = os.path.join(self.manual_path, tss + ".gff") fasta = os.path.join(self.fasta_path, tss + ".fa") stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss) if os.path.exists(manual): print("Merging and classiflying manually-detected " "TSSs for {0}".format(tss)) merge_manual_predict_tss( predict, stat_file, os.path.join(self.tmps["tss"], filename), os.path.join(args_tss.gffs, gff), args_tss, manual, fasta) if os.path.exists(stat_file): shutil.move(stat_file, os.path.join( args_tss.out_folder, "statistics", tss, stat_file)) self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder, [".gff"]) shutil.rmtree(self.tmps["tss"]) def _validate(self, tsss, args_tss, log): '''validate TSS with genome annotation''' print("Validating TSSs with genome annotations") log.write("Running validate_gene.py to compare genome " "annotations and TSSs/PSs.\n") for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break stat_file = os.path.join( self.stat_outfolder, tss, "".join(["stat_gene_vali_", tss, ".csv"])) out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff") if args_tss.program.lower() == "tss": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) elif args_tss.program.lower() == "processing": compare_file = 
# Continuation (mid-assignment "compare_file ="): end of _validate. Then
# _compare_ta: pairs TSS gffs with transcript gffs (stat_ta_tss) and sorts
# the updated files back in place. Then _stat_tss: per-genome
# classification/library statistics, venn plots, and relocation of the
# generated png/tsv outputs (typo "statistaics" in the runtime message —
# left unchanged).
os.path.join(self.gff_outfolder, "_".join([tss, "processing.gff"])) validate_gff(compare_file, os.path.join(args_tss.gffs, gff), stat_file, out_cds_file, args_tss.utr_length, args_tss.program.lower()) log.write("\t" + stat_file + " is generated.\n") shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff)) def _compare_ta(self, tsss, args_tss, log): '''compare TSS with transcript''' detect = False log.write("Running stat_TA_comparison to compare transcripts " "and TSSs/PSs.\n") print("Comparing transcripts and TSSs") self.multiparser.parser_gff(args_tss.ta_files, "transcript") self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None, "transcript") for tss in tsss: stat_out = os.path.join( self.stat_outfolder, tss, "".join([ "stat_compare_TSS_transcript_", tss, ".csv"])) for ta in os.listdir(self.tmps["ta"]): filename = ta.split("_transcript") if (filename[0] == tss) and (filename[1] == ".gff"): detect = True break compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) if detect: stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file, stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"], args_tss.fuzzy) self.helper.sort_gff(self.tmps["tss_ta"], compare_file) self.helper.sort_gff(self.tmps["ta_tss"], os.path.join(args_tss.ta_files, ta)) os.remove(self.tmps["tss_ta"]) os.remove(self.tmps["ta_tss"]) detect = False log.write("\t" + stat_out + " is generated.\n") def _stat_tss(self, tsss, feature, log): print("Running statistaics") for tss in tsss: compare_file = os.path.join(self.gff_outfolder, "_".join([tss, feature]) + ".gff") stat_tsspredator( compare_file, feature, os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "class", tss]) + ".csv"), os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "libs", tss]) + ".csv")) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_class", ".png"]) if os.path.exists(os.path.join( self.stat_outfolder, "TSSstatistics.tsv")): 
# Continuation: end of _stat_tss (moves TSSstatistics.tsv, venn plots, logs
# outputs). Then _set_gen_config: pairs each fasta with its gff and wig
# (again via the "_STRAIN_" convention) and emits a config_<prefix>.ini
# through _gen_config.
# NOTE(review): in _set_gen_config, `detect` is initialised once outside the
# fasta loop and never reset to False after a successful match, and `run` is
# assigned but never used — looks like latent dead/buggy state; confirm
# against upstream history before changing. Then _merge_wigs: concatenates
# matching +/- strand wigs into tmp/merge_forward.wig / tmp/merge_reverse.wig,
# and _check_orphan begins (re-classifies orphan TSSs using the merged wigs;
# relies on the hard-coded relative "tmp" folder).
shutil.move( os.path.join( self.stat_outfolder, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, tss, "TSSstatistics.tsv")) plot_venn(compare_file, feature) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_venn", ".png"]) log.write("The following files in {0} are generated:\n".format( (os.path.join(self.stat_outfolder, tss)))) for file_ in os.listdir(os.path.join( self.stat_outfolder, tss)): log.write("\t" + file_ + "\n") def _set_gen_config(self, args_tss, input_folder, log): prefixs = [] detect = False log.write("Generating config files for TSSpredator.\n") for fasta in os.listdir(self.fasta_path): run = False for gff in os.listdir(self.gff_path): if fasta[:-3] == gff[:-4]: prefix = fasta[:-3] for wig in os.listdir(self.wig_path): filename = wig.split("_STRAIN_") if filename[1][:-4] == prefix: detect = True break if detect: prefixs.append(prefix) config = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._gen_config( prefix, args_tss, os.path.join(self.gff_path, gff), self.wig_path, os.path.join(self.fasta_path, fasta), config, log) return prefixs def _merge_wigs(self, wig_folder, prefix, libs): self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tmp"])) for wig_file in os.listdir(wig_folder): for lib in libs: info = lib.split(":") if (info[0][:-4] in wig_file) and (info[-1] == "+") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_forward.wig")) if (info[0][:-4] in wig_file) and (info[-1] == "-") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_reverse.wig")) def _check_orphan(self, prefixs, wig_folder, args_tss): '''if genome has no locus tag, it can use for classify the TSS''' for prefix in prefixs: self._merge_wigs(wig_folder, prefix, 
# Continuation (inside the _merge_wigs(...) call): end of _check_orphan.
# Then _remove_files (cleanup; typo "temperary" in the runtime message —
# left unchanged), _deal_with_overlap (mutual filtering of TSS vs processing
# sites via filter_tss_pro), and the start of _low_expression (per-genome
# coverage cutoff for lowly-expressed TSSs; writes
# stat_<prefix>_low_expression_cutoff.csv).
args_tss.libs) tmp_tss = os.path.join(self.tmps["tmp"], "_".join([ prefix, args_tss.program + ".gff"])) pre_tss = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program + ".gff"])) check_orphan(pre_tss, os.path.join( args_tss.gffs, prefix + ".gff"), "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss) shutil.move(tmp_tss, pre_tss) shutil.rmtree("tmp") def _remove_files(self, args_tss): print("Remove temperary files and folders") self.helper.remove_tmp_dir(args_tss.fastas) self.helper.remove_tmp_dir(args_tss.gffs) self.helper.remove_tmp_dir(args_tss.ta_files) if "merge_forward.wig" in os.listdir(os.getcwd()): os.remove("merge_forward.wig") if "merge_reverse.wig" in os.listdir(os.getcwd()): os.remove("merge_reverse.wig") shutil.rmtree(args_tss.wig_folder) if args_tss.manual is not None: shutil.rmtree(args_tss.manual) def _deal_with_overlap(self, out_folder, args_tss): '''deal with the situation that TSS and processing site at the same position''' if not args_tss.overlap_feature: pass else: print("Comparing TSSs and Processing sites") if args_tss.program.lower() == "tss": for tss in os.listdir(out_folder): if tss.endswith("_TSS.gff"): ref = self.helper.get_correct_file( args_tss.overlap_gffs, "_processing.gff", tss.replace("_TSS.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.program, args_tss.cluster) elif args_tss.program.lower() == "processing": for tss in os.listdir(out_folder): if tss.endswith("_processing.gff"): ref = self.helper.get_correct_file( args_tss.overlap_gffs, "_TSS.gff", tss.replace("_processing.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.program, args_tss.cluster) def _low_expression(self, args_tss, gff_folder): '''deal with the low expressed TSS''' prefix = None self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs) for gff in os.listdir(gff_folder): if (args_tss.program.lower() == "tss") and ( gff.endswith("_TSS.gff")): prefix = 
# Continuation (mid-assignment "prefix ="): end of _low_expression, then
# run_tsspredator — the public driver: checks gff attributes, parses
# fasta/gff/wig inputs, generates configs, runs TSSpredator per prefix,
# converts output to gff, optionally re-checks orphans, filters low
# expression, merges manual TSSs, handles TSS/PS overlap, and runs stats,
# validation and transcript comparison.
gff.replace("_TSS.gff", "") elif (args_tss.program.lower() == "processing") and ( gff.endswith("_processing.gff")): prefix = gff.replace("_processing.gff", "") if prefix: out = open(os.path.join( self.stat_outfolder, prefix, "_".join([ "stat", prefix, "low_expression_cutoff.csv"])), "w") out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n") cutoff = filter_low_expression( os.path.join(gff_folder, gff), args_tss, "tmp/merge_forward.wig", "tmp/merge_reverse.wig", "tmp/without_low_expression.gff") out.write("\t".join([prefix, str(cutoff)]) + "\n") os.remove(os.path.join(gff_folder, gff)) shutil.move("tmp/without_low_expression.gff", os.path.join(gff_folder, gff)) prefix = None out.close() def run_tsspredator(self, args_tss, log): input_folder = os.path.join(args_tss.out_folder, "configs") for gff in os.listdir(args_tss.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_tss.gffs, gff)) self.helper.check_make_folder(self.gff_outfolder) self.multiparser.parser_fasta(args_tss.fastas) self.multiparser.parser_gff(args_tss.gffs, None) self.multiparser.parser_wig(args_tss.wig_folder) prefixs = self._set_gen_config(args_tss, input_folder, log) for prefix in prefixs: out_path = os.path.join( self.master, "_".join(["MasterTable", prefix])) config_file = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._start_to_run(args_tss.tsspredator_path, config_file, out_path, prefix, log) if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")): shutil.move(os.path.join(out_path, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, "TSSstatistics.tsv")) if args_tss.program.lower() == "ps": args_tss.program = "processing" self._convert_gff(prefixs, args_tss, log) if args_tss.check_orphan: print("checking the orphan TSSs") log.write("Running check_orphan.py to re-check orphan TSSs.\n") self._check_orphan(prefixs, os.path.join(args_tss.wig_folder, "tmp"), args_tss) self.multiparser.combine_gff(args_tss.gffs, 
# Continuation (inside combine_gff(...)): the remainder of run_tsspredator.
self.gff_outfolder, None, args_tss.program) datas = [] for gff in os.listdir(self.gff_outfolder): if gff.endswith(".gff"): gff_folder = gff.replace("".join(["_", args_tss.program, ".gff"]), "") self.helper.check_make_folder( os.path.join(self.stat_outfolder, gff_folder)) datas.append(gff_folder) if args_tss.remove_low_expression is not None: log.write("Running filter_low_expression.py to filter out " "low expressed TSS/PS.\n") self._low_expression(args_tss, self.gff_outfolder) if args_tss.manual is not None: self.multiparser.parser_gff(args_tss.manual, None) self.multiparser.combine_gff(args_tss.gffs, self.manual_path, None, None) self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path, None) self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None, args_tss.libs) log.write("Running merge_manual.py to merge the manual TSSs.\n") self._merge_manual(datas, args_tss) log.write("Running filter_TSS_pro.py to deal with the overlap " "position between TSS and PS.\n") self._deal_with_overlap(self.gff_outfolder, args_tss) log.write("Running stat_TSSpredator.py to do statistics.\n") self._stat_tss(datas, args_tss.program, log) if args_tss.validate: self._validate(datas, args_tss, log) if args_tss.ta_files is not None: self._compare_ta(datas, args_tss, log) self._remove_files(args_tss)
# Terminator: driver class for terminator prediction (TransTermHP + coverage
# based detection in gene-converged regions).
# NOTE(review): this appears to be a SECOND copy of the Terminator
# implementation — the methods on earlier lines of this file (e.g. the other
# _run_rnafold / _compute_stat / _compare_term_tran) duplicate the ones below
# with small differences (extra log/print text, tmp_index handling, different
# candidate-folder names). One of the two versions is likely stale; confirm
# which is live before consolidating. This class's run_terminator is
# truncated at the end of this view. Code preserved byte-identically; only
# annotation comments inserted at token-safe positions.
#
# Next line: __init__ (builds the gffs/tables output tree, tmp-path table,
# and suffix table; srna_path only when sRNA input is given), then
# _combine_annotation (concatenates ptt/rnt bodies after the "Location"
# header line) and the start of _make_gff_folder.
class Terminator(object): def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables")} self.terms = {"all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "express"), "best": os.path.join(self.outfolder["term"], "best"), "non": os.path.join(self.outfolder["term"], "non_express")} self.csvs = {"all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "express"), "best": os.path.join(self.outfolder["csv"], "best"), "non": os.path.join(self.outfolder["csv"], "non_express")} self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp")} self.suffixs = {"gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff"} if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: check_start = False fh = open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) 
# Continuation: end of _make_gff_folder (creates best/express/non folders).
# Then _convert_gff2rntptt: per gff, produces .rnt/.ptt (plus sRNA .rnt when
# sRNA input exists) and records the file type per prefix; errors out when
# the matching fasta is missing. Then _combine_ptt_rnt begins: merges
# ptt/rnt(/sRNA rnt) into combine/<prefix>.ptt for TransTermHP.
self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file( fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: no proper file - {0}.fa".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" if (not srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" else: self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" return file_types, prefixs def _combine_ptt_rnt(self, gff_path, file_types, srna_path): self.helper.check_make_folder(self.combine_path) for prefix, file_type in file_types.items(): combine_file = os.path.join(self.combine_path, prefix + '.ptt') if file_type == "normal": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt")] self._combine_annotation(combine_file, files) elif file_type == "srna": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt"), os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))] 
# Continuation: end of _combine_ptt_rnt. Then _TransTermHP (single external
# TransTermHP invocation with --t2t-perf/--bag-output), _run_TransTermHP
# (per-genome driver around it), _convert_to_gff (bag -> gff via
# convert_transtermhp2gff), and the start of _combine_wigs (copies tex+frag
# wigs into a shared merge_wigs folder when both inputs exist).
self._combine_annotation(combine_file, files) def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term): call([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))], stdout=out) def _run_TransTermHP(self, args_term): self.helper.check_make_folder(self.tmps["transterm"]) for file_ in os.listdir(self.combine_path): if ".ptt" in file_: prefix = file_.replace(".ptt", "") fasta = self.helper.get_correct_file( self.fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: no proper file - {0}.fa".format(prefix)) sys.exit() out_path = os.path.join(args_term.hp_folder, prefix) self.helper.check_make_folder(out_path) out = open(os.path.join(out_path, "_".join([prefix, "terminators.txt"])), "w") self._TransTermHP(fasta, file_, out_path, prefix, out, args_term) out.close() shutil.rmtree(self.combine_path) def _convert_to_gff(self, prefixs, args_term): for prefix in prefixs: for folder in os.listdir(args_term.hp_folder): if prefix == folder: out_path = os.path.join(args_term.hp_folder, folder) for file_ in os.listdir(out_path): if file_.endswith(".bag"): out_file = os.path.join( self.tmps["transterm"], "_".join([prefix, self.tmps["hp_gff"]])) self.converter.convert_transtermhp2gff( os.path.join(out_path, file_), out_file) self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"], None, self.tmps["hp"]) def _combine_wigs(self, args_term): if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): folder = args_term.tex_wigs.split("/") folder = "/".join(folder[:-1]) merge_wigs = os.path.join(folder, "merge_wigs") self.helper.check_make_folder(merge_wigs) for wig in os.listdir(args_term.tex_wigs): if os.path.isdir(os.path.join(args_term.tex_wigs, wig)): pass else: 
# Continuation: end of _combine_wigs. Then _merge_sRNA (merges genome gff +
# sRNA gff into a sorted per-prefix gff under tmp_merge_gff when sRNAs are
# supplied, otherwise returns the plain gff path) and the start of
# _move_file (sorts per-genome term gffs, seeds the all_candidates gff and
# the combined CSV).
shutil.copy(os.path.join(args_term.tex_wigs, wig), merge_wigs) for wig in os.listdir(args_term.frag_wigs): if os.path.isdir(os.path.join(args_term.frag_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.frag_wigs, wig), merge_wigs) elif (args_term.tex_wigs is not None): merge_wigs = args_term.tex_wigs elif (args_term.frag_wigs is not None): merge_wigs = args_term.frag_wigs else: print("Error: no proper wig files!!!") sys.exit() return merge_wigs def _merge_sRNA(self, sRNAs, prefixs, gff_path): if sRNAs is not None: self.multiparser.parser_gff(sRNAs, "sRNA") self.helper.check_make_folder(self.tmps["merge"]) for prefix in prefixs: tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"]) if self.tmps["gff"] in os.listdir(self.tmps["merge"]): os.remove(tmp_gff) self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"), tmp_gff) self.helper.merge_file(os.path.join( self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff) self.helper.sort_gff(tmp_gff, os.path.join( self.tmps["merge"], prefix + ".gff")) os.remove(tmp_gff) merge_path = self.tmps["merge"] else: merge_path = gff_path return merge_path def _move_file(self, term_outfolder, csv_outfolder): for gff in os.listdir(term_outfolder): if gff.endswith("_term.gff"): self.helper.sort_gff(os.path.join(term_outfolder, gff), self.tmps["gff"]) shutil.move(self.tmps["gff"], os.path.join(term_outfolder, gff)) prefix = gff.replace("_term.gff", "") new_gff = os.path.join(self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]])) csv_file = os.path.join( os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) out = open(new_gff, "w") out.write("##gff-version 3\n") out.close() self.helper.merge_file( os.path.join(term_outfolder, gff), os.path.join( self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]]))) os.remove(os.path.join(term_outfolder, gff)) pre_strain = "" if ("_".join([prefix, self.suffixs["csv"]]) in os.listdir(self.csvs["all"])): os.remove(csv_file) out_csv = 
# Continuation (mid-assignment "out_csv ="): end of _move_file (CSV header
# here is the older lowercase variant without "Coverage_decrease"), then
# _run_rnafold (same "secondray" typo as the other copy) and the start of
# _compute_intersection_forward_reverse (this variant has no tmp_index and
# no os.path.exists(tran_file) guard).
open(csv_file, "w") out_csv.write("\t".join(["strain", "name", "start", "end", "strand", "detect", "coverage_detail"]) + "\n") out_csv.close() fh = open(new_gff) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: self.helper.merge_file(os.path.join( self.tmps["term_table"], "_".join([ entry.seq_id, "term_raw.csv"])), os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) pre_strain = entry.seq_id fh.close() def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix): print("Computing secondray structure of {0}".format(prefix)) self.helper.check_make_folder(self.tmps["folder"]) pre_cwd = os.getcwd() os.chdir(self.tmps["folder"]) os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)])) os.chdir(pre_cwd) shutil.rmtree(self.tmps["folder"]) def _compute_intersection_forward_reverse( self, prefixs, merge_path, wig_path, merge_wigs, args_term): for prefix in prefixs: tmp_seq = os.path.join(args_term.out_folder, "_".join(["inter_seq", prefix])) tmp_sec = os.path.join(args_term.out_folder, "_".join(["inter_sec", prefix])) tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) gff_file = os.path.join(merge_path, prefix + ".gff") print("Extracting seq of {0}".format(prefix)) intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"), tran_file, gff_file, tmp_seq) self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix) tmp_cand = os.path.join(args_term.out_folder, "_".join(["term_candidates", prefix])) poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term) print("detection of terminator") detect_coverage( tmp_cand, os.path.join(merge_path, prefix + ".gff"), os.path.join(self.tran_path, "_".join([ prefix, "transcript.gff"])), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(wig_path, "_".join([prefix, "forward.wig"])), os.path.join(wig_path, "_".join([prefix, "reverse.wig"])), os.path.join(self.tmps["hp_path"], "_".join([ 
# Continuation (inside detect_coverage(...)): end of
# _compute_intersection_forward_reverse. Then _remove_tmp_file:
# NOTE(review): this copy passes args_term.tex_wigs / args_term.frag_wigs to
# helper.remove_tmp even when they may be None (the `and` guard above only
# covers the rmtree) — confirm remove_tmp tolerates None. Then _compute_stat
# (older ID scheme "term<n>"/"Terminator_%05d", stats behind args_term.stat),
# _check_gff_file, _compare_term_tran (older 6-arg compare_term_tran with
# fixed "best"/"express" folder names), and the start of run_terminator,
# which is TRUNCATED at the end of this view (its remainder is outside this
# chunk) — left byte-identical.
prefix, self.tmps["hp_gff"]])), merge_wigs, os.path.join(self.outfolder["term"], "_".join([ prefix, self.suffixs["gff"]])), os.path.join(self.tmps["term_table"], "_".join([ prefix, "term_raw.csv"])), args_term) self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"], None, "term") self._move_file(self.outfolder["term"], self.outfolder["csv"]) def _remove_tmp_file(self, merge_wigs, args_term): self.helper.remove_tmp(args_term.gffs) self.helper.remove_tmp(args_term.fastas) if args_term.srnas is not None: self.helper.remove_tmp(args_term.srnas) shutil.rmtree(self.tmps["merge"]) if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): shutil.rmtree(merge_wigs) self.helper.remove_tmp(args_term.trans) self.helper.remove_tmp(args_term.tex_wigs) self.helper.remove_tmp(args_term.frag_wigs) self.helper.remove_tmp(self.outfolder["term"]) shutil.rmtree(self.tmps["transterm"]) shutil.rmtree(self.tmps["term_table"]) self.helper.remove_all_content(args_term.out_folder, "inter_seq_", "file") self.helper.remove_all_content(args_term.out_folder, "inter_sec_", "file") self.helper.remove_all_content(args_term.out_folder, "term_candidates_", "file") def _compute_stat(self, args_term): new_prefixs = [] for gff in os.listdir(self.terms["all"]): if gff.endswith("_term_all.gff"): out_tmp = open(self.tmps["gff"], "w") out_tmp.write("##gff-version 3\n") new_prefix = gff.replace("_term_all.gff", "") new_prefixs.append(gff.replace("_term_all.gff", "")) num = 0 fh = open(os.path.join(self.terms["all"], gff)) for entry in self.gff_parser.entries(fh): name = '%0*d' % (5, num) entry.attributes["ID"] = "term" + str(num) entry.attributes["Name"] = "_".join(["Terminator_" + name]) entry.attribute_string = ";".join([ "=".join(items) for items in entry.attributes.items()]) out_tmp.write("\t".join([entry.info_without_attributes, entry.attribute_string]) + "\n") num += 1 out_tmp.close() fh.close() shutil.move(self.tmps["gff"], os.path.join(self.terms["all"], 
"_".join([new_prefix, self.suffixs["gff"]]))) if args_term.stat: stat_path = os.path.join(args_term.out_folder, "statistics") for prefix in new_prefixs: stat_term(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])), os.path.join(self.terms["best"], "_".join([prefix, "term"])), os.path.join(self.terms["express"], "_".join([prefix, "term"])), os.path.join(self.terms["non"], "_".join([prefix, "term"]))) shutil.move(os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]]))) os.remove(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) def _check_gff_file(self, folder): for file_ in os.listdir(folder): if file_.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, file_)) def _compare_term_tran(self, args_term): self.multiparser.combine_gff(args_term.gffs, self.tran_path, None, "transcript") for type_ in ("best", "express", "all_candidates"): compare_term_tran(self.tran_path, os.path.join(self.outfolder["term"], type_), args_term.fuzzy_up_ta, args_term.fuzzy_down_ta, args_term.out_folder, "terminator") shutil.move( os.path.join( args_term.out_folder, "statistics", "stat_comparison_terminator_transcript.csv"), os.path.join( args_term.out_folder, "statistics", "stat_comparison_terminator_transcript_" + type_ + ".csv")) def run_terminator(self, args_term): self._check_gff_file(args_term.gffs) self._check_gff_file(args_term.trans) 
self.multiparser.parser_fasta(args_term.fastas) if (not args_term.gffs) or (not args_term.fastas): print("Error: please assign gff annotation folder " "and fasta folder!!!") sys.exit() file_types, prefixs = self._convert_gff2rntptt( self.gff_path, self.fasta_path, args_term.srnas) self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path) self._run_TransTermHP(args_term) self._convert_to_gff(prefixs, args_term) self.helper.remove_tmp(self.gff_path) self.multiparser.parser_gff(args_term.trans, "transcript") self.helper.check_make_folder(self.tmps["term_table"]) self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"]) merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path) self._compute_intersection_forward_reverse( prefixs, merge_path, args_term.wig_path, args_term.merge_wigs, args_term) self._compute_stat(args_term) self._compare_term_tran(args_term) self._remove_tmp_file(args_term.merge_wigs, args_term)
class PPINetwork(object):
    '''detection of PPI'''

    def __init__(self, out_folder):
        # helpers / parsers shared by all steps
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        # output layout under out_folder
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        # subfolder names distinguishing strain-specific vs unspecific runs
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        # scratch file names/paths used by the wget steps
        self.tmp_files = {"log": "tmp_log",
                          "action": "tmp_action.log",
                          "pubmed": "tmp_pubmed.log",
                          "specific": os.path.join(out_folder,
                                                   "tmp_specific"),
                          "nospecific": os.path.join(out_folder,
                                                     "tmp_nospecific"),
                          "wget_action": os.path.join(out_folder,
                                                      "tmp_action")}

    def _make_folder_no_exist(self, path, folder):
        """mkdir path/folder only if it does not exist yet."""
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        """mkdir path/strain and path/strain/ptt (both assumed absent)."""
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        """Download source to folder via wget; sleep briefly to rate-limit."""
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(2)

    def _wget_id(self, strain, locus, strain_id, files):
        """Resolve a locus tag to a STRING ID (one file per locus).

        Returns True if the strain matched and a download was attempted.
        """
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                locus, strain_id["string"])
            self._run_wget(id_source, os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        """Resolve STRING IDs for every queried gene of this strain."""
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        """Map a STRING item id (e.g. "93061.SAOUHSC_00001") to its
        preferred name, downloading the id file on demand.

        Returns "" when no mapping is found.
        NOTE(review): assumes row_a contains a "." — row_a.split(".")[1]
        raises IndexError otherwise; confirm upstream format.
        """
        prefername = ""
        filename = row_a.split(".")
        if (filename[1] not in os.listdir(
                files["id_list"])) and ("all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        """Write the two header lines for an interaction table."""
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file, first_output,
                    ptt, files, paths, args_ppi):
        """Query PIE (PubMed) for a protein pair and merge the scored
        literature evidence into the all/best result tables."""
        prefer1 = self._get_prefer_name(row[0], strain_id, files,
                                        args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id, files,
                                        args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                # query without constraining to the strain
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                    prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            # strain-specific query: spaces in the PIE term become '+'
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            # rewrite the STRING row in place with merged mode/actor and
            # preferred names before emitting it
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"], row,
                    args_ppi.score, id_file, files["id_list"], "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        """Write one interaction line; row == "NA" marks no PIE evidence."""
        if row == "NA":
            out_single.write(
                "\t".join([ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write(
                "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        """Merge one downloaded PIE result file into the per-pair, the
        aggregated "all", and (score-filtered) "best" tables.

        first_output tracks whether the aggregated files still need their
        title header.  An empty download writes an NA line instead.
        """
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(
                os.path.join(best_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write(
                    "\t".join([ptt, "\t".join(row_a),
                               "\t".join(row)]) + "\n")
                # only pairs with PIE score >= threshold go to "best"
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a, ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write(
                        "\t".join([ptt, "\t".join(row_a),
                                   "\t".join(row)]) + "\n")
            f_h.close()
            if not detect:
                # no row passed the score filter: drop the empty best file
                os.remove(os.path.join(
                    best_folder, ptt,
                    "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write(
                "\t".join([ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        """Scan the .ptt table and collect {strain, locus_tag} dicts for
        either all proteins or the explicitly queried coordinates.

        NOTE(review): `name` is only assigned when a strain-header row was
        seen first; a data row before any header raises NameError.
        """
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            # single-column rows like "name - 1..100" carry the strain name
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                # query format: strain:start:end:strand
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1) and (row[0] != "Location") and (
                            name == strain) and (
                            start == row[0].split("..")[0]) and (
                            end == row[0].split("..")[1]) and (
                            strand == row[1]):
                        genes.append({"strain": name, "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        """Create the without_strain folders and open their csv handles."""
        self._make_subfolder(paths["all"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["best"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["fig"], self.without_strain,
                             strain_id["ptt"])
        filename_nostrain = "_".join([
            strain_id["file"].replace(".ptt", ""),
            self.without_strain + ".csv"])
        files["all_nospecific"] = open(
            os.path.join(paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(
            os.path.join(paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        """Create the result tree for one strain, open its output/log file
        handles, and read the queried genes from its .ptt file.

        NOTE(review): `genes` is only assigned on the branch where the ptt
        file is new AND present in args_ppi.ptts; on the repeat-file branch
        `return genes` raises NameError. Also `pre_file` is rebound locally
        and never returned — confirm both against the caller's expectations.
        """
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder("_".join(
                    [self.tmp_id, strain_id["file"]]))
                # [:-4] strips the ".ptt" extension
                paths["all"] = os.path.join(self.all_result,
                                            strain_id["file"][:-4])
                paths["best"] = os.path.join(self.best_result,
                                             strain_id["file"][:-4])
                paths["fig"] = os.path.join(self.fig,
                                            strain_id["file"][:-4])
                self.helper.check_make_folder(
                    os.path.join(self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.fig, strain_id["file"][:-4]))
                self._make_subfolder(paths["all"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["best"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["fig"], self.with_strain,
                                     strain_id["ptt"])
                filename_strain = "_".join([
                    strain_id["file"].replace(".ptt", ""),
                    self.with_strain + ".csv"])
                files["all_specific"] = open(
                    os.path.join(paths["all"], filename_strain), "w")
                files["best_specific"] = open(
                    os.path.join(paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(os.path.join(
                    files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                # same ptt file as before: only ensure subfolders exist
                self._make_folder_no_exist(
                    os.path.join(paths["all"], self.with_strain),
                    strain_id["ptt"])
                self._make_folder_no_exist(
                    os.path.join(paths["best"], self.with_strain),
                    strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        """Download STRING "actions" for the first matching STRING id in
        id_file.  Returns False when the id file held no usable row."""
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
            id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            if row[0].startswith("stringId"):
                # header line
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/actions?"
                                     "identifier={0}&species={1}").format(
                        row[0], row[1])
                    self._run_wget(action_source,
                                   self.tmp_files["wget_action"],
                                   files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''get the interaction of proteins'''
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    # per-id flags: have the aggregated csv headers been
                    # written yet for each (specificity, all/best) combo?
                    first_output = {"specific_all": True,
                                    "specific_best": True,
                                    "nospecific_all": True,
                                    "nospecific_best": True}
                    print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                        id_file, strain_id["string"], strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            # header line
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                # group consecutive rows of the same pair,
                                # concatenating mode/actor with ';'
                                if (row_a[0] != pre_row[0]) or (
                                        row_a[1] != pre_row[1]):
                                    self._get_pubmed(
                                        pre_row, strain_id, mode, actor,
                                        id_file, first_output,
                                        strain_id["ptt"], files, paths,
                                        args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    if detect:
                        # flush the last grouped pair
                        detect = False
                        self._get_pubmed(row_a, strain_id, mode, actor,
                                         id_file, first_output,
                                         strain_id["ptt"], files, paths,
                                         args_ppi)
                if detect_id:
                    a_h.close()

    def _plot(self, args_ppi, files):
        """Close the aggregated csv handles and render one network figure
        per strain folder (and per specificity mode)."""
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(os.path.join(
                             self.all_result, folder,
                             "_".join([folder, self.with_strain + ".csv"])),
                         args_ppi.score,
                         os.path.join(self.fig, folder, self.with_strain),
                         args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(
                        os.path.join(
                            self.all_result, folder,
                            "_".join([folder,
                                      self.without_strain + ".csv"])),
                        args_ppi.score,
                        os.path.join(self.fig, folder, self.without_strain),
                        args_ppi.size)

    def _remove_tmps(self, args_ppi):
        """Remove all tmp* scratch files/folders and generated PPI_ ptts."""
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''retrieve PPI from STRING with PIE and draw network'''
        strain_ids = []
        paths = {}
        files = {}
        # each strain spec: gff:ptt-strain-name:string-name:pie-name
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            # translate the given STRING name to the canonical taxon id
            # using the species table (columns: id, ?, name, alias)
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
class CircRNADetection(object):
    """Detect circular RNAs: align reads with segemehl, realign split
    reads with testrealign.x, and compare splice sites to annotation."""

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # output layout under args_circ.output_folder
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # canonical names of the bed files produced by testrealign.x
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all",
                        "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all",
                      "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        # NOTE(review): both branches assign the same fasta_path; only the
        # align branch additionally requires fastas to be present.
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        """Wait for all subprocesses, close their pipes, then kill (ignored
        if already exited)."""
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        """Decompress .bz2/.gz read files into .fa files.

        Returns the list of created (temporary) paths so they can be
        removed later.
        """
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                # ensure a fasta-like extension on the decompressed name
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        """Build a segemehl index for one genome fasta."""
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        """Start one segemehl alignment (split-read mode, -S) as a
        background process and return the Popen handle."""
        out = open(os.path.join(self.alignment_path, fasta_prefix,
                                sam_file), "w")
        log = open(os.path.join(self.alignment_path, fasta_prefix,
                                log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        """Align every read file against every genome, at most
        args_circ.cores segemehl processes at a time.

        Returns (align_files, prefixs): read_genome basenames and the
        genome prefixes.
        """
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                # NOTE(review): counter increments even for non-fasta
                # files, so batches may be smaller than `cores`.
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                        args_circ, index, fasta, read,
                        sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        """samtools view -bS: convert one sam to bam."""
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files):
        """Convert all sams in the folder to bams; classify which
        intermediates to delete afterwards.

        Returns (bam_files, convert_ones, remove_ones) where convert_ones
        are bams created here from externally-supplied sams and
        remove_ones are sams that came from our own alignment step.
        """
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        """Merge all bams into whole_reads.bam, sort it, drop the merge."""
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            # NOTE(review): IndexError if bam_files is empty — confirm
            # callers guarantee at least one bam.
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            # NOTE(review): shell string via os.system — paths with spaces
            # would break; a call([...]) list would be safer.
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
              whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        """Convert the sorted whole-reads bam back to sam (with header)."""
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        """Merge+sort bams, emit the sam, then clean up all intermediates
        (converted bams, original sams, decompressed reads)."""
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        """Run segemehl's testrealign.x for one genome; it writes .bed
        files into cwd, which are moved into the splice folder."""
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"),
            "-d", os.path.join(self.fasta_path, prefix + ".fa"),
            "-q", os.path.join(sub_alignment_path,
                               self.bams["sort"] + ".sam"),
            "-n"])
        # NOTE(review): shell redirection via os.system (see merge above)
        os.system(command + " 2>" + err_log)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        """Collect per-header splice/trans bed files into per-fasta
        "all" bed files (one scratch folder per fasta in cwd).

        Returns the list of fasta prefixes that were processed.
        """
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                # collect every sequence header in this fasta
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(os.path.join(
                    os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.splices["file"]),
                        os.path.join(fasta_prefix, "_".join([
                            self.splices["splice"], header + ".bed"])))
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.trans["file"]),
                        os.path.join(fasta_prefix, "_".join([
                            self.trans["trans"], header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                if len(headers) > 1:
                    # multiple sequences: concatenate the per-header beds
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                fasta_prefix, file_), out_trans)
                else:
                    shutil.move(os.path.join(
                        fasta_prefix, "_".join([
                            self.splices["splice"],
                            headers[0] + ".bed"])), out_splice)
                    shutil.move(os.path.join(
                        fasta_prefix, "_".join([
                            self.trans["trans"],
                            headers[0] + ".bed"])), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        """Compare splice sites with annotation, write candidate tables,
        statistics, and circRNA gff files (all + best)."""
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(os.path.join(
                    self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA",
                                           prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        """Pick the folder of pre-aligned bams to use (copying fragmented
        bams into the normal folder when both are given).

        Returns (merge_folder, remove_frags, bam_files).
        """
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(
                        os.path.join(args_circ.frag_bams, frag),
                        os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        """Entry point: run the whole circRNA detection pipeline."""
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            # align raw reads ourselves
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            # reuse user-supplied bam files
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class TestConverter(unittest.TestCase):
    """Unit tests for Converter (gff/embl/ptt/rnt/mastertable conversions).

    External collaborators (gff3parser, tsspredator, file reading) are
    replaced with mocks in setUp, so only Converter's own formatting and
    conversion logic is exercised.
    """

    def setUp(self):
        """Wire mocks into a fresh Converter and stage example fixtures."""
        self.converter = Converter()
        self.example = Example()
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)

    def tearDown(self):
        """Remove the scratch folder created by setUp."""
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    @staticmethod
    def get_info(datas):
        """Strip comment lines and keep only the first eight tab columns.

        Used to compare gff-like outputs while ignoring header comments
        and the free-form attribute column.
        """
        # NOTE: was previously declared without @staticmethod (and without
        # ``self``), so it could never be called via an instance.
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        return f_datas

    def test_print_rntptt_file(self):
        """_print_rntptt_file formats CDS and RNA entries as ptt/rnt rows."""
        cdss = []
        genes = []
        rnas = []
        gff_dict = Example().gff_dict
        for gff in gff_dict:
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        # Drop the trailing empty string produced by the final newline.
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        """_srna2rntptt converts a sRNA gff file into rnt-style rows."""
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        """_multi_embl_pos extracts feature positions from embl lines."""
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        """_parser_embl_data converts an embl file and reports accession."""
        embl_file = os.path.join(self.test_folder, "test.embl")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index],
                                 self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        """_multi_tss_class accumulates TSS class counts per entry."""
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(tss, tss_index, tss_features,
                                            nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        """convert_mastertable2gff turns a TSSpredator table into gff."""
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic",
                                               "TSS", "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        """convert_gff2rntptt emits ptt/rnt (and sRNA) files from gff+fasta."""
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(
            gff_file, fasta_file, ptt_file, rnt_file,
            srna_input_file, srna_output_file)
        self.assertTrue(srna_output_file)
        self.assertTrue(rnt_file)
        self.assertTrue(ptt_file)

    def test_convert_embl2gff(self):
        """convert_embl2gff writes a gff equivalent of the embl input."""
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        # Skip the header line and the trailing terminator lines.
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        """convert_transtermhp2gff parses a TransTermHP .bag into gff."""
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    def test_convert_circ2gff(self):
        """convert_circ2gff writes both an 'all' and a filtered gff."""
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        # Compare only the first eight columns via the shared helper
        # (previously this filtering logic was duplicated inline four times).
        datas = import_data(out_all)
        self.assertListEqual(self.get_info(datas),
                             self.get_info(self.circ_all.split("\n")))
        datas = import_data(out_filter)
        self.assertListEqual(self.get_info(datas),
                             self.get_info(self.circ_best.split("\n")))
class RATT(object):
    '''annotation transfer

    Drives the external RATT tool to transfer reference annotation
    (genbank/embl) onto target fasta genomes, then converts the RATT
    embl output into gff/ptt/rnt files.
    '''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # NOTE(review): if neither ref_gbk nor ref_embls is given,
        # self.embl is never set and _run_ratt would fail -- presumably
        # the argument parser enforces one of them; confirm upstream.
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Scratch locations used while merging per-strain outputs.
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        """Generate .ptt/.rnt beside each transferred .gff in ``files``."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(
                    self.tmp_files["tar"], ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(
                        gff, fasta, ptt, rnt, None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        """Move final outputs into place and delete temporary folders."""
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temporary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        """Convert one RATT embl result to gff and record its filename."""
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record genbank files into one .gbk per accession.

        Records are delimited by LOCUS ... // ; the output name is taken
        from LOCUS and overridden by VERSION (which carries the version
        suffix, e.g. NC_007795.1).
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            out = None  # no record open yet; avoids NameError on odd input
            with open(file_, "r") as f_h:
                for line in f_h:
                    if line.startswith("LOCUS"):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif line.startswith("VERSION"):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join(
                                    [data.strip(), "gbk"])
                                break
                        # BUGFIX: the original used
                        # ``if new_filename.find(filename):`` -- str.find
                        # returns an index, so 0 (prefix match) is falsy
                        # and -1 (no match) is truthy. Intended behavior:
                        # prefer the VERSION-based name when it differs.
                        if new_filename != filename:
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        out = None  # record finished; stop writing
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if (not close) and (out is not None):
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write(
                "--related_gbk_files is assigned, but no gbk files are "
                "detected.\nThe gbk file names need to be ended at "
                ".gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write("Running converter.py to convert gbk file "
                      "to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl + " is generated and the embl "
                      "files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        """Invoke RATT for one target/reference pair; exit on bad names."""
        if (not os.path.exists(self.embl)) or (not os.path.exists(
                os.path.join(self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(
                    os.path.join(self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be "
                      "the same as the strain names in fasta, "
                      "genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        """Run RATT for every pair and tidy up the files it scatters."""
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # BUGFIX: previously a new handle was opened per pair but
            # only the last one was closed -- use a context manager.
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
                log.write("The following files are generated:\n")
                for filename in os.listdir():
                    if "final" in filename:
                        log.write("\t" + filename + "\n")
                        shutil.move(filename,
                                    os.path.join(args_ratt.output_path,
                                                 filename))
                    elif (args_ratt.element in filename) or (
                            "query" in filename) or (
                            "Reference" in filename) or (
                            "Query" in filename) or (
                            "Sequences" in filename):
                        # RATT working files -- remove them.
                        log.write("\t" + filename + "\n")
                        if os.path.isfile(filename):
                            os.remove(filename)
                        if os.path.isdir(filename):
                            shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        """Top-level driver: run RATT, convert outputs, merge per strain."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # BUGFIX: was ``args_ratt.ref_gbki`` -- a typo for ref_gbk
            # (AttributeError at runtime; every other access uses ref_gbk).
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt "
                          "format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        # NOTE(review): ``data`` is the last value from the loop above;
        # if output_path had no entries this raises NameError -- presumably
        # RATT always produces output here; confirm.
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
class PPINetwork(object):
    """Build protein-protein interaction networks via STRING and PIE.

    Workflow: convert gff to ptt, resolve locus tags to STRING IDs,
    download interaction "actions" from string-db.org, query NCBI PIE
    for supporting PubMed evidence, write per-strain all/best CSV
    tables, and plot the results. All downloads go through ``wget``.
    """

    def __init__(self, out_folder):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        # Output layout under out_folder.
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        # Scratch files for wget downloads and logs.
        self.tmp_files = {"log": "tmp_log",
                          "action": "tmp_action.log",
                          "pubmed": "tmp_pubmed.log",
                          "specific": os.path.join(
                              out_folder, "tmp_specific"),
                          "nospecific": os.path.join(
                              out_folder, "tmp_nospecific"),
                          "wget_action": os.path.join(
                              out_folder, "tmp_action")}

    def _make_folder_no_exist(self, path, folder):
        # Create path/folder only if missing (no error when it exists).
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        # Create path/strain/ptt (fails if strain already exists).
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        # Download ``source`` to ``folder`` via wget; sleep to rate-limit.
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(1)

    def _wget_id(self, strain, locus, strain_id, files):
        """Resolve one locus tag to a STRING ID (downloaded as a file).

        Returns True if the strain matched and a download was attempted.
        """
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                         locus, strain_id["string"])
            self._run_wget(id_source, os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        """Download STRING IDs for every detected gene."""
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        """Map a STRING item id (e.g. "93061.SAOUHSC_00001") to its
        preferred gene name via the downloaded resolve file.

        Returns "" when no match is found.
        """
        prefername = ""
        filename = row_a.split(".")
        # Download the resolve file on demand unless running in "all" mode.
        if (filename[1] not in os.listdir(files["id_list"])) and (
                "all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        """Write the two header lines for an interaction CSV."""
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\ta_is_acting\t"
                  "STRING_action_score\tpubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file,
                    first_output, ptt, files, paths, args_ppi):
        """Query NCBI PIE for PubMed evidence of one interaction pair and
        merge the downloads into the all/best output tables.

        ``row`` is mutated in place: mode/actor and preferred names are
        written back into it before merging.
        """
        prefer1 = self._get_prefer_name(row[0], strain_id, files,
                                        args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id, files,
                                        args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                # Strain-agnostic PIE query.
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source, self.tmp_files["nospecific"],
                               files["pubmed_log"])
            # Strain-specific PIE query (always performed).
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                    prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"], row,
                    args_ppi.score, id_file, files["id_list"], "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        # ``row`` is either a parsed PIE row or the literal string "NA".
        if row == "NA":
            out_single.write("\t".join(
                [ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write("\t".join(
                [ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        """Append one interaction's PIE results to the aggregate "all"
        table and, for rows scoring >= ``score``, to the "best" table;
        also writes per-pair single CSV files.

        ``first_output`` flags ensure the aggregate headers are printed
        only once per file_type/all-best combination.
        """
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(os.path.join(
                all_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(os.path.join(
                best_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write("\t".join([ptt, "\t".join(row_a),
                                         "\t".join(row)]) + "\n")
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a, ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write("\t".join([ptt, "\t".join(row_a),
                                              "\t".join(row)]) + "\n")
            f_h.close()
            # No row passed the score cutoff: drop the empty "best" file.
            if not detect:
                os.remove(os.path.join(
                    best_folder, ptt,
                    "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            # Empty download: record the pair with NA evidence.
            out_all_single = open(os.path.join(
                all_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a),
                                     "NA", "NA"]) + "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        """Parse the .ptt file and collect genes matching the queries.

        Queries are "strain:start:end:strand" strings, or "all".
        """
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            # Header line like "Name - 1..2821361" carries the strain name.
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1) and (row[0] != "Location") and (
                            name == strain) and (
                            start == row[0].split("..")[0]) and (
                            end == row[0].split("..")[1]) and (
                            strand == row[1]):
                        genes.append({"strain": name, "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        """Create the without_strain folders and open their CSV outputs."""
        self._make_subfolder(
            paths["all"], self.without_strain, strain_id["ptt"])
        self._make_subfolder(
            paths["best"], self.without_strain, strain_id["ptt"])
        self._make_subfolder(
            paths["fig"], self.without_strain, strain_id["ptt"])
        filename_nostrain = "_".join(
            [strain_id["file"].replace(".ptt", ""),
             self.without_strain + ".csv"])
        files["all_nospecific"] = open(os.path.join(
            paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(os.path.join(
            paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        """Prepare the per-strain output tree, open the aggregate CSV
        files (stored in ``files``), and return the detected genes.

        NOTE(review): rebinding ``pre_file`` here has no effect on the
        caller, so the ``!= pre_file`` branch is effectively always
        taken; and if strain_id["file"] is missing from args_ppi.ptts,
        ``genes`` is never assigned and the final return raises
        NameError -- confirm intended behavior upstream.
        """
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder(
                    "_".join([self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(
                    self.all_result, strain_id["file"][:-4])
                paths["best"] = os.path.join(
                    self.best_result, strain_id["file"][:-4])
                paths["fig"] = os.path.join(
                    self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(
                    os.path.join(self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.fig, strain_id["file"][:-4]))
                self._make_subfolder(
                    paths["all"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                    paths["best"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                    paths["fig"], self.with_strain, strain_id["ptt"])
                filename_strain = "_".join(
                    [strain_id["file"].replace(".ptt", ""),
                     self.with_strain + ".csv"])
                files["all_specific"] = open(os.path.join(
                    paths["all"], filename_strain), "w")
                files["best_specific"] = open(os.path.join(
                    paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id, strain_id["file"]])
                files["id_log"] = open(os.path.join(
                    files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                # Same strain file as the previous call: only make sure
                # the subfolders exist.
                self._make_folder_no_exist(os.path.join(
                    paths["all"], self.with_strain), strain_id["ptt"])
                self._make_folder_no_exist(os.path.join(
                    paths["best"], self.with_strain), strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        """Download the STRING "actions" table for one resolved ID.

        Returns True when the resolve file contained a usable row.
        """
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
            id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            if row[0].startswith("stringId"):
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/actions?"
                                     "identifier={0}&species={1}").format(
                                     row[0], row[1])
                    self._run_wget(
                        action_source, self.tmp_files["wget_action"],
                        files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        """Walk the downloaded action tables and fetch PubMed evidence.

        Consecutive rows for the same (item_a, item_b) pair are folded
        together, concatenating their mode/actor with ";" before a
        single _get_pubmed call per pair.
        """
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    first_output = {"specific_all": True,
                                    "specific_best": True,
                                    "nospecific_all": True,
                                    "nospecific_best": True}
                    print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                        id_file, strain_id["string"], strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                if (row_a[0] != pre_row[0]) or (
                                        row_a[1] != pre_row[1]):
                                    # New pair: flush the previous one.
                                    self._get_pubmed(
                                        pre_row, strain_id, mode, actor,
                                        id_file, first_output,
                                        strain_id["ptt"], files, paths,
                                        args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    # Same pair: accumulate mode/actor.
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    # Flush the final pair, if any rows were seen.
                    if detect:
                        detect = False
                        self._get_pubmed(
                            row_a, strain_id, mode, actor, id_file,
                            first_output, strain_id["ptt"], files, paths,
                            args_ppi)
                # NOTE(review): a_h is only closed when detect_id is True,
                # matching where it was opened.
                if detect_id:
                    a_h.close()

    def _plot(self, args_ppi, files):
        """Close the aggregate CSVs and draw one figure per strain."""
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(os.path.join(self.all_result, folder,
                         "_".join([folder, self.with_strain + ".csv"])),
                         args_ppi.score,
                         os.path.join(self.fig, folder, self.with_strain),
                         args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(os.path.join(
                        self.all_result, folder,
                        "_".join([folder, self.without_strain + ".csv"])),
                        args_ppi.score,
                        os.path.join(self.fig, folder, self.without_strain),
                        args_ppi.size)

    def _remove_tmps(self, args_ppi):
        """Delete tmp* scratch files/folders and generated PPI_ ptt/rnt."""
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        """Public entry point: build the PPI network for every strain.

        Each args_ppi.strains entry is "gff:strain_name:string_id:pie".
        """
        strain_ids = []
        paths = {}
        files = {}
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            # Normalize the STRING species identifier against the
            # species table (taxon id in column 0, names in 2/3).
            # NOTE(review): s_h is never closed; confirm acceptable here.
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
class TSSpredator(object):
    """Drive TSSpredator: build its .ini config per strain, run the jar,
    and post-process the MasterTables into GFF files plus statistics.

    Relies on module-level helpers (Multiparser, Helper, Converter,
    merge_manual_predict_tss, validate_gff, stat_ta_tss, stat_tsspredator,
    plot_venn, check_orphan, filter_tss_pro, filter_low_expression)
    imported elsewhere in this file.
    """

    def __init__(self, args_tss):
        # Collaborator objects shared by all steps.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Folder where TSSpredator writes one MasterTable_<strain> per run.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of temporary files/folders used to pass data between steps.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        # "tmp" subfolders are produced by Multiparser.parser_* calls.
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Convert one colon-split library spec into a lookup dict.

        Expected field order: wig filename, tex/notex flag, condition
        number, replicate letter, strand.
        """
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3], "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        """Write '<prefix>_<cond><rep> = <wig path>' config lines for every
        library in lib_list, grouped by condition and sorted by replicate."""
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Invoke the TSSpredator jar for one strain, capturing stdout and
        stderr into log.txt / err.txt inside the MasterTable folder."""
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        """Parse the library specs, write the fivePrime* and annotation/
        genome lines of the config, and return bookkeeping values.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id); num_id
        is the value left over from the last range loop (== lib_num).
        Exits the process on malformed wig names or unknown program.
        """
        # fp/fm = tex-treated +/- strand; np/nm = untreated +/- strand.
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        # NOTE(review): "Runniun" is a typo in the progress message.
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # Replace the generic wig name with the strain-specific file
            # produced by the wig parser ("<lib>_STRAIN_<strain>.wig").
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # TSS mode uses the tex libraries as the "fivePrime" signal;
        # processing-site mode uses the notex libraries instead.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus")
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        # NOTE(review): num_id is the leftover loop variable (== lib_num).
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        """Write a complete TSSpredator .ini config for one strain,
        combining fixed defaults with the tunables from args_tss."""
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" (background) tracks are the opposite treatment of
        # whatever _import_lib used as the fivePrime signal.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus")
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus")
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus")
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus")
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        """Convert each strain's MasterTable.tsv into a GFF in
        gff_outfolder; warns (but continues) if the table is missing."""
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            # NOTE(review): the file is opened before checking for the
            # MasterTable, so a missing table still leaves an empty .gff.
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        """Merge manually curated TSSs into the predicted GFFs and move
        the comparison statistics into the per-strain statistics folder."""
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation gff matching this strain prefix; `gff`
            # keeps the last match after break (assumes one exists).
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(args_tss.out_folder,
                        "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        """Validate gene annotations against predicted TSS/processing
        sites and replace each annotation gff with the updated copy."""
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            # NOTE(review): compare_file is only assigned for "tss" or
            # "processing"; any other program value raises NameError here.
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        """Compare TSS predictions with transcript-assembly GFFs and
        re-sort both files after annotation exchange."""
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_Transcriptome_assembly_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        """Generate per-strain TSS class/library statistics, venn plots,
        and collect the resulting PNG/TSV files into stat_outfolder."""
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # stat_tsspredator/plot_venn drop their PNGs in the cwd;
            # sweep them into the per-strain statistics folder.
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        """Generate a config file for every strain that has a matching
        fasta, gff, and wig file; return the strain prefixes found."""
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                # Match "<strain>.fa" against "<strain>.gff".
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): detect is never reset to False, so a
                    # wig hit for one strain carries over to later ones.
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate all forward/reverse wig files for the libraries
        into tmp/merge_forward.wig and tmp/merge_reverse.wig (cwd)."""
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        """Re-examine orphan TSSs against merged coverage wigs and
        replace each predicted gff with the checked version."""
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete the parser "tmp" folders and any leftover merged wigs
        from the working directory."""
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        """Filter TSS vs. processing-site overlaps according to
        args_tss.overlap_feature ("both" means keep everything)."""
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        """Remove low-expression TSSs per strain and record the computed
        coverage cutoff in a per-strain statistics CSV."""
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): only the last opened file is closed here (the
        # close is outside the loop); earlier handles leak, and this
        # raises NameError if no gff matched at all.
        out.close()

    def run_tsspredator(self, args_tss):
        """Main entry point: run the full predict/convert/merge/filter/
        statistics pipeline over every strain in args_tss."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        # Split multi-record inputs into per-strain files under tmp/.
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # From here on the shorter name "processing" is used in filenames.
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class TestConverter(unittest.TestCase):
    """Unit tests for Converter's format-conversion methods, using mock
    parsers and fixture strings from the Example class."""

    def setUp(self):
        # Fresh Converter with its parsers replaced by mocks so no real
        # file parsing happens; fixtures come from Example().
        self.converter = Converter()
        self.example = Example()
        self.converter.gff3parser = Mock_gff3_parser
        self.converter._print_rntptt_title = Mock_func().print_rntptt_title
        self.converter.tsspredator = Mock_TSSPredatorReader()
        self.converter._read_file = Mock_func().mock_read_file
        self.gff_file = self.example.gff_file
        self.ptt_out = self.example.ptt_out
        self.rnt_out = self.example.rnt_out
        self.srna_out = self.example.srna_out
        self.embl_file = self.example.embl_file
        self.embl_out = self.example.embl_out
        self.multi_embl = self.example.multi_embl
        self.gff_out = self.example.gff_out
        self.mastertable = self.example.mastertable
        self.tss_file = self.example.tss_file
        self.fasta_file = self.example.fasta_file
        self.transterm = self.example.transterm
        self.term_file = self.example.term_file
        self.circ_file = self.example.circrna_table
        self.circ_all = self.example.circrna_all
        self.circ_best = self.example.circrna_best
        # Scratch directory for tests that write real files.
        self.test_folder = "test_folder"
        self.mock_args = MockClass()
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)

    def tearDown(self):
        # Remove the scratch directory created in setUp.
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_print_rntptt_file(self):
        """_print_rntptt_file should emit the expected ptt/rnt rows for
        CDS and tRNA entries, respectively."""
        cdss = []
        genes = []
        rnas = []
        gff_dict = Example().gff_dict
        for gff in gff_dict:
            if gff["feature"] == "gene":
                genes.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "CDS":
                cdss.append(self.converter.gff3parser.entries(self, gff))
            elif gff["feature"] == "tRNA":
                rnas.append(self.converter.gff3parser.entries(self, gff))
        out_p = StringIO()
        out_r = StringIO()
        self.converter._print_rntptt_file(out_p, cdss, genes)
        self.converter._print_rntptt_file(out_r, rnas, genes)
        # [:-1] drops the empty string after the trailing newline.
        self.assertEqual(out_p.getvalue().split("\n")[:-1],
                         self.example.ptt_out_list)
        self.assertEqual(out_r.getvalue().split("\n")[:-1],
                         self.example.rnt_out_list)
        out_p.close()
        out_r.close()

    def test_srna2pttrnt(self):
        """_srna2rntptt should convert an sRNA gff into the expected
        rnt/ptt-style output."""
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        srnas = []
        self.converter._srna2rntptt(srna_input_file, srna_output_file,
                                    srnas, 1234567)
        datas = import_data(srna_output_file)
        self.assertEqual(set(datas), set(self.srna_out.split("\n")))

    def test_multi_embl_pos(self):
        """_multi_embl_pos should parse EMBL lines into position dicts,
        returning "Wrong" for lines it cannot handle."""
        embls = []
        for line in self.embl_file.split("\n"):
            datas = self.converter._multi_embl_pos(line.strip())
            if datas != "Wrong":
                embls.append(datas)
        for index in range(0, 7):
            self.assertDictEqual(embls[index], self.embl_out[index])
        for index in range(0, 2):
            self.assertDictEqual(embls[-1]["pos"][index],
                                 self.multi_embl[index])

    def test_parser_embl_data(self):
        """_parser_embl_data should emit GFF lines and return the
        accession plus multi-position info."""
        embl_file = os.path.join(self.test_folder, "test.embl")
        # NOTE(review): embl_out below is assigned but never used.
        embl_out = os.path.join(self.test_folder, "test.embl_out")
        out = StringIO()
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        info = self.converter._parser_embl_data(embl_file, out)
        datas = out.getvalue().split("\n")
        self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n")))
        self.assertEqual(info[0], "NC_007795.1")
        for index in range(0, 2):
            self.assertDictEqual(info[1]["pos"][index],
                                 self.multi_embl[index])
        out.close()

    def test_multi_tss_class(self):
        """_multi_tss_class should accumulate TSS counts/classes from the
        mocked mastertable entries."""
        nums = {"tss": 0, "tss_uni": 0, "class": 1}
        utrs = {"total": [], "pri": [], "sec": []}
        tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []}
        tss_index = defaultdict(lambda: 0)
        master_file = os.path.join(self.test_folder, "test.tsv")
        fh = StringIO(self.mastertable)
        for tss in self.converter.tsspredator.entries(fh):
            self.converter._multi_tss_class(
                tss, tss_index, tss_features, nums, utrs)
        fh.close()
        self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2})

    def test_convert_mastertable2gff(self):
        """convert_mastertable2gff should produce the expected TSS gff
        from a MasterTable TSV."""
        master_file = os.path.join(self.test_folder, "test.tsv")
        with open(master_file, "w") as th:
            th.write(self.mastertable)
        out_gff = os.path.join(self.test_folder, "test.tsv_out")
        self.converter.convert_mastertable2gff(master_file, "ANNOgesic",
                                               "TSS", "aaa", out_gff)
        datas = import_data(out_gff)
        self.assertEqual(set(datas), set(self.tss_file.split("\n")))

    def test_convert_gff2rntptt(self):
        """convert_gff2rntptt should run end-to-end producing ptt, rnt
        and sRNA output files (only existence is asserted here)."""
        srna_input_file = os.path.join(self.test_folder, "srna.gff")
        srna_output_file = os.path.join(self.test_folder, "srna.out")
        gff_file = os.path.join(self.test_folder, "test.gff")
        rnt_file = os.path.join(self.test_folder, "test.rnt")
        ptt_file = os.path.join(self.test_folder, "test.ptt")
        fasta_file = os.path.join(self.test_folder, "test.fa")
        with open(srna_input_file, "w") as fh:
            fh.write(self.gff_file)
        with open(gff_file, "w") as fh:
            fh.write(self.gff_file)
        with open(fasta_file, "w") as fh:
            fh.write(self.fasta_file)
        self.converter.convert_gff2rntptt(
            gff_file, fasta_file, ptt_file, rnt_file,
            srna_input_file, srna_output_file)
        # NOTE(review): these assert on non-empty path strings, which are
        # always truthy — they do not check that the files exist.
        self.assertTrue(srna_output_file)
        self.assertTrue(rnt_file)
        self.assertTrue(ptt_file)

    def test_convert_embl2gff(self):
        """convert_embl2gff should write the expected GFF body (header
        and trailing lines are sliced off before comparing)."""
        embl_file = os.path.join(self.test_folder, "test.embl")
        gff_file = os.path.join(self.test_folder, "test.embl_out")
        with open(embl_file, "w") as eh:
            for line in self.embl_file.split("\n"):
                eh.write(line + "\n")
        self.converter.convert_embl2gff(embl_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n")))

    def test_convert_transtermhp2gff(self):
        """convert_transtermhp2gff should translate a TransTermHP .bag
        file into the expected terminator gff."""
        transterm_file = os.path.join(
            self.test_folder, "test_best_terminator_after_gene.bag")
        gff_file = os.path.join(self.test_folder, "transterm.gff")
        with open(transterm_file, "w") as th:
            th.write(self.transterm)
        self.converter.convert_transtermhp2gff(transterm_file, gff_file)
        datas = import_data(gff_file)
        self.assertEqual(set(datas), set(self.term_file.split("\n")))

    # NOTE(review): get_info is declared without `self` and is never
    # called by the tests (test_convert_circ2gff repeats this logic
    # inline) — it looks like a leftover helper, not a real method.
    def get_info(datas):
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        return f_datas

    def test_convert_circ2gff(self):
        """convert_circ2gff should split a circRNA table into an "all"
        gff and a filtered "best" gff matching the fixtures (only the
        first 8 tab-separated columns are compared)."""
        circ_file = os.path.join(self.test_folder, "circ.csv")
        out_all = os.path.join(self.test_folder, "all.gff")
        out_filter = os.path.join(self.test_folder, "best.gff")
        with open(circ_file, "w") as ch:
            ch.write(self.circ_file)
        args = self.mock_args.mock()
        args.start_ratio = 0.5
        args.end_ratio = 0.5
        args.support = 5
        self.converter.convert_circ2gff(circ_file, args, out_all, out_filter)
        datas = import_data(out_all)
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        c_datas = []
        for data in self.circ_all.split("\n"):
            if not data.startswith("#"):
                c_datas.append("\t".join(data.split("\t")[:8]))
        self.assertListEqual(f_datas, c_datas)
        datas = import_data(out_filter)
        f_datas = []
        for data in datas:
            if not data.startswith("#"):
                f_datas.append("\t".join(data.split("\t")[:8]))
        c_datas = []
        for data in self.circ_best.split("\n"):
            if not data.startswith("#"):
                c_datas.append("\t".join(data.split("\t")[:8]))
        self.assertListEqual(f_datas, c_datas)
class CircRNADetection(object):
    """Detect circular RNAs: align reads with segemehl, merge/sort BAMs
    with samtools, find splice sites with testrealign.x, then convert
    candidates into statistics and GFF files."""

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output sub-folders under the pipeline's output folder.
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Canonical names of the testrealign output BED files.
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all", "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all", "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        # NOTE(review): both branches assign the same fasta_path; the
        # only real difference is the early exit when aligning without
        # a fasta folder.
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        """Wait for every subprocess, close its pipes, and make sure it
        is dead before continuing."""
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        """Decompress .bz2/.gz read files in place (appending .fa when no
        fasta-like suffix remains); return the paths of the temporary
        decompressed files so they can be deleted later."""
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        """Build the segemehl index (.idx) for one genome fasta."""
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        """Start one asynchronous segemehl alignment (-S = split reads)
        writing SAM to sam_file; returns the Popen handle."""
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        """Align every read file against every genome fasta, running up
        to args_circ.cores segemehl processes in parallel.

        Returns (align_files, prefixs): the "<read>_<fasta>" basenames
        produced and the genome prefixes.
        """
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                # NOTE(review): counter increments for every directory
                # entry, including non-fasta files that are skipped below.
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                        args_circ, index, fasta, read,
                        sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        """samtools view -bS: convert one SAM file to BAM."""
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        """Convert all SAMs in a folder to BAMs and classify the results.

        Returns (bam_files, convert_ones, remove_ones): BAMs to merge,
        BAMs created here that should be deleted afterwards, and source
        SAMs to delete.
        """
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        """Merge all BAMs into whole_reads.bam (copy when only one),
        sort it into whole_reads_sort.bam, delete the unsorted merge."""
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            # NOTE(review): os.system with joined paths — breaks on
            # paths containing spaces; confirm inputs are space-free.
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
              whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        """samtools view -h: convert the sorted BAM back to SAM (with
        header) for testrealign.x."""
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        """Merge+sort the BAMs, regenerate the SAM, then delete all
        intermediate BAM/SAM files and temporary decompressed reads."""
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        """Run segemehl's testrealign.x on the sorted SAM to produce the
        splice-site BED files, collecting them into splice_path/prefix."""
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"),
            "-d", os.path.join(self.fasta_path, prefix + ".fa"),
            "-q", os.path.join(sub_alignment_path,
                               self.bams["sort"] + ".sam"),
            "-n"])
        # testrealign.x writes its BEDs into the cwd; stderr goes to the log.
        os.system(command + " 2>" + err_log)
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        """Gather per-sequence-header BED files into per-fasta-file
        folders (in the cwd) and merge them into *_all.bed files.

        Returns the list of fasta prefixes (folder names) created.
        """
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                # Collect every ">" header in this fasta file.
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(
                    os.path.join(os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.splices["file"]),
                        os.path.join(fasta_prefix,
                                     "_".join([self.splices["splice"],
                                               header + ".bed"])))
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.trans["file"]),
                        os.path.join(fasta_prefix,
                                     "_".join([self.trans["trans"],
                                               header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                # Multi-record fastas get their per-header BEDs merged;
                # single-record ones are just renamed.
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_),
                                out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_),
                                out_trans)
                else:
                    shutil.move(
                        os.path.join(
                            fasta_prefix,
                            "_".join([self.splices["splice"],
                                      headers[0] + ".bed"])),
                        out_splice)
                    shutil.move(
                        os.path.join(
                            fasta_prefix,
                            "_".join([self.trans["trans"],
                                      headers[0] + ".bed"])),
                        out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        """For each genome prefix, run detect_circrna on the merged
        splice sites and convert the candidate table into all/best GFFs."""
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            # Move the per-prefix BED folder from the cwd into splice_path.
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(
                    os.path.join(self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA",
                                           prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA",
                                           prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        """Pick the folder whose BAMs will be merged (copying fragmented
        BAMs into the normal folder when both are given).

        Returns (merge_folder, remove_frags, bam_files); exits when no
        BAM folder is available at all.
        """
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams,
                                                 frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        """Main entry point: optional alignment, BAM merge/sort, splice
        detection, candidate statistics/GFF generation, and cleanup."""
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            # No alignment: derive prefixes from the fastas and reuse
            # the user-supplied BAM folders.
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class TSSpredator(object):
    """Driver for running the external TSSpredator Java tool and
    post-processing its output into ANNOgesic GFF files and statistics.

    The workflow (see ``run_tsspredator``) is: generate a TSSpredator
    config file per genome, invoke ``java -jar``, convert the resulting
    MasterTable to GFF, then optionally merge manually-curated TSSs,
    filter low-expressed sites, resolve TSS/processing-site overlaps,
    validate against annotations and compare with transcripts.
    """

    def __init__(self, args_tss):
        # Project helpers (declared elsewhere in this package).
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Folder that will hold one "MasterTable_<genome>" folder per genome.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of temporary files/folders used throughout the run.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # NOTE(review): manual_path is only set when --manual is given;
        # _merge_manual assumes it exists (run_tsspredator guards the call).
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Turn one split library spec (wig:tex:cond:rep:strand) into a dict."""
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write the per-condition wiggle-library lines of the config file.

        For each condition 1..lib_num, emit one
        "<prefix>_<condition><replicate> = <wig path>" line per library,
        and an empty assignment for every replicate in rep_set that has
        no library in that condition (TSSpredator expects all keys).
        """
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): "cond" below is the last item of the loop above;
            # if cond_sort_list is empty this raises NameError — presumably
            # every condition always has at least one library. TODO confirm.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path,
                      prefix, log):
        """Invoke the TSSpredator jar with the generated config file.

        stdout/stderr of the Java process are captured to log.txt/err.txt
        inside out_path; progress is appended to the pipeline log.
        """
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        # Blocking call: waits for the Java process to finish.
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        """Parse library specs and write library/annotation/genome config lines.

        Each entry of libs is "name.wig:tex|notex:condition:replicate:strand".
        Libraries are bucketed into fp/fm (tex +/-) and np/nm (notex +/-);
        the five-prime pair written here depends on the program (tss vs ps).
        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id).
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # Replace the generic wig name with the strain-specific file
            # ("<name>_STRAIN_<strain>.wig") found in wig_folder.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            # For processing sites the notex libraries act as five-prime data.
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            # NOTE(review): "assing" is a typo in the released message;
            # kept verbatim because it is runtime output.
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        # NOTE(review): num_id is the leftover loop variable (== lib_num).
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # If any spec contains "all", one global minNumRepMatches is written;
        # otherwise the most common per-library value becomes the global one
        # and deviating libraries get a per-library override line.
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        """Read the optimized parameter set from best_<prefix>.csv.

        The last line's second column encodes parameters as
        "he_<v>_rh_<v>_fa_<v>_rf_<v>_bh_<v>_ef_<v>_pf_<v>"; each value is
        extracted by looking up the token after its tag. Exits on a
        missing/incomplete file.
        """
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file), "r")
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines)-1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be due to that \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines)-1].split("\t")[1].split("_")
                    detect_all = all(elem in para_info
                                     for elem in ["he", "rh", "fa", "rf",
                                                  "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        # NOTE(review): message likely meant "is NOT
                        # complete"; kept verbatim (runtime string).
                        print("Error: {0} is complete. Some parameters are "
                              "missing!".format(best_file))
                        log.write("Error: {0} is complete. Some parameters "
                                  "are missing!\n".format(best_file))
                        sys.exit()
                    else:
                        detect = True
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        """Pick the user-supplied TSSpredator parameters for one genome.

        Without --genome_order the first value of each parameter list is
        used; otherwise the values at the genome's index are used. Exits
        if the genome is not listed in genome_order.
        """
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not "
                      "assigned!".format(prefix))
                log.write("Error: the parameters for {0} were not "
                          "assigned!\n".format(prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file, log):
        '''generation of config files'''
        # Parameters come either from optimize_tss_ps output (--auto_load)
        # or directly from the command-line arguments.
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor = \
                self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
                base_height, enrichment_factor, processing_factor = \
                self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        # Fixed TSSpredator settings, then the tunable cliff/height values.
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" libraries are the complement of the five-prime pair
        # chosen in _import_lib: notex for tss, tex for processing sites.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        """Convert each genome's MasterTable.tsv into a GFF in gff_outfolder."""
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                    out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                    out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                # NOTE(review): missing space before "is generated" is in
                # the original log string; kept verbatim.
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation gff matching this genome prefix; "gff"
            # keeps the value from the break below.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff),
                    args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # The annotation gff is replaced by the validated version.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            # Look for "<tss>_transcript.gff"; "ta" keeps the matched name.
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
                log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        """Run per-genome TSS/PS statistics and collect plots and tables."""
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # Plots are produced in the cwd and moved into the stat folder.
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        """Return genome prefixes that have fasta, gff and wig data.

        A prefix qualifies when a fasta and a gff share the same stem and
        at least one wig file's "_STRAIN_" suffix matches it.
        """
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): detect is never reset to False, so after
                    # the first match every later prefix is accepted too —
                    # TODO confirm this is intended.
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Merge all matching wig files into tmp/merge_forward|reverse.wig."""
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            # Re-classified gff replaces the original prediction in place.
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete temporary folders and merged wig leftovers after the run."""
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            # Each predicted file is filtered against its counterpart
            # (TSS vs processing-site) from overlap_gffs.
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): closes the last opened stat file; raises NameError
        # if no gff matched at all — assumes at least one prediction file
        # exists. TODO confirm.
        out.close()

    def run_tsspredator(self, args_tss, log):
        """Top-level entry: run the whole TSS/PS prediction pipeline."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        # Per genome: generate config, run the Java tool, collect stats.
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # From here on "ps" is referred to as "processing" in filenames.
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs,
                                           self.fasta_path, None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class CircRNADetection(object): '''Detection of circRNA''' def __init__(self, args_circ): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.alignment_path = os.path.join(args_circ.output_folder, "segemehl_alignment_files") self.splice_path = os.path.join(args_circ.output_folder, "segemehl_splice_results") self.candidate_path = os.path.join(args_circ.output_folder, "circRNA_tables") self.gff_folder = os.path.join(args_circ.output_folder, "gffs") self.gff_path = os.path.join(args_circ.gffs, "tmp") self.splices = {"file": "splicesites.bed", "splice": "splicesites"} self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"} self.fasta_path = os.path.join(args_circ.fastas, "tmp") def _wait_process(self, processes): '''wait for the parallels to finish the process''' for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _deal_zip_file(self, read_files, log): tmp_datas = [] tmp_reads = [] for reads in read_files: zips = [] tmp_datas = reads["files"] for read in reads["files"]: if read.endswith(".bz2"): mod_read = read.replace(".bz2", "") if (".fa" not in mod_read) and ( ".fasta" not in mod_read) and ( ".fna" not in mod_read) and ( ".fq" not in mod_read) and ( ".fastq" not in mod_read): mod_read = mod_read + ".fa" read_out = open(mod_read, "w") tmp_datas.append(mod_read) zips.append(mod_read) print(" ".join(["Uncompressing", read])) log.write(" ".join(["bzcat", read]) + "\n") call(["bzcat", read], stdout=read_out) log.write("\t" + mod_read + " is generated.\n") read_out.close() elif read.endswith(".gz"): mod_read = read.replace(".gz", "") if (".fa" not in mod_read) and ( ".fasta" not in mod_read) and ( ".fna" not in mod_read) and ( ".fq" not in mod_read) and ( ".fastq" not in mod_read): mod_read = mod_read + ".fa" read_out = open(mod_read, "w") tmp_datas.append(mod_read) zips.append(mod_read) print(" 
".join(["Uncompressing", read])) log.write(" ".join(["zcat", read]) + "\n") call(["zcat", read], stdout=read_out) read_out.close() log.write("\t" + mod_read + " is generated.\n") tmp_reads.append({"sample": reads["sample"], "files": tmp_datas, "zips": zips}) return tmp_reads def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index, fasta, log): log.write(" ".join([segemehl_path, "-x", os.path.join(fasta_path, index), "-d", os.path.join(fasta_path, fasta)]) + "\n") call([segemehl_path, "-x", os.path.join(fasta_path, index), "-d", os.path.join(fasta_path, fasta)]) def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file, log_file, fasta_prefix, log): out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file), "w") log = open(os.path.join(self.alignment_path, fasta_prefix, log_file), "w") log.write(" ".join([args_circ.segemehl_path, "-i", os.path.join(self.fasta_path, index), "-d", os.path.join(self.fasta_path, fasta), "-q", read, "-S"]) + "\n") p = Popen([args_circ.segemehl_path, "-i", os.path.join(self.fasta_path, index), "-d", os.path.join(self.fasta_path, fasta), "-q", read, "-S"], stdout=out, stderr=log) return p def _align(self, args_circ, read_datas, log): '''align the read. 
if the bam files are provided, it can be skipped.''' prefixs = [] align_files = [] log.write("Using segemehl to align the read.\n") log.write("Please make sure the version of segemehl is at least 0.1.9.\n") for fasta in os.listdir(self.fasta_path): index = fasta.replace(".fa", ".idx") self._run_segemehl_fasta_index(args_circ.segemehl_path, self.fasta_path, index, fasta, log) processes = [] num_process = 0 fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) self.helper.check_make_folder(os.path.join( self.alignment_path, fasta_prefix)) log.write("Running for {0}.\n".format(fasta_prefix)) for reads in read_datas: for read in reads["files"]: num_process += 1 read_name = read.split("/")[-1] if read_name.endswith(".fa") or \ read_name.endswith(".fna") or \ read_name.endswith(".fasta") or \ read_name.endswith(".fq") or \ read_name.endswith(".fastq"): filename = read_name.split(".") read_prefix = ".".join(filename[:-1]) sam_file = "_".join([read_prefix, fasta_prefix + ".sam"]) log_file = "_".join([read_prefix, fasta_prefix + ".log"]) align_files.append("_".join([read_prefix, fasta_prefix])) print("Mapping {0}".format(sam_file)) p = self._run_segemehl_align( args_circ, index, fasta, read, sam_file, log_file, fasta_prefix, log) processes.append(p) if num_process == args_circ.cores: self._wait_process(processes) num_process = 0 self._wait_process(processes) log.write("Done!\n") log.write("The following files are generated in {0}:\n".format( os.path.join(self.alignment_path, fasta_prefix))) for file_ in os.listdir(os.path.join( self.alignment_path, fasta_prefix)): log.write("\t" + file_ + "\n") return align_files, prefixs def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam, log): log.write(" ".join([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) + "\n") call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files, log): bam_files = [] convert_ones = [] 
remove_ones = [] log.write("Using Samtools to convert SAM files to BAM files.\n") log.write("Please make sure the version of Samtools is at least 1.3.1.\n") for sam in os.listdir(sub_alignment_path): pre_sam = os.path.join(sub_alignment_path, sam) if sam.endswith(".sam"): bam_file = sam.replace(".sam", ".bam") print("Converting {0} to {1}".format(sam, bam_file)) out_bam = os.path.join(sub_alignment_path, bam_file) self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam, log) bam_files.append(out_bam) if align_files: if bam_file.replace(".bam", "") not in align_files: convert_ones.append(out_bam) else: remove_ones.append(pre_sam) elif sam.endswith(".bam"): if (pre_sam not in convert_ones) and ( pre_sam not in remove_ones): bam_files.append(pre_sam) elif sam.endswith(".log"): os.remove(pre_sam) log.write("Done!\n") log.write("The following files are generated:\n") for file_ in os.listdir(sub_alignment_path): if file_.endswith(".bam"): log.write("\t" + os.path.join(sub_alignment_path, file_) + "\n") return bam_files, convert_ones, remove_ones def _run_samtools_merge_sort(self, samtools_path, prefix, out_folder, bam_datas, log): log.write("Using Samtools for merging, sorting and converting " "the BAM files.\n") log.write("Make sure the version Samtools is at least 1.3.1.\n") for bam_data in bam_datas: print("Merging bam files for {0} of {1}".format( prefix, bam_data["sample"])) sample_bam = os.path.join(out_folder, "_".join([ prefix, bam_data["sample"] + ".bam"])) if len(bam_data["files"]) <= 1: shutil.copyfile(bam_data["files"][0], sample_bam) else: file_line = " ".join(bam_data["files"]) log.write(" ".join([samtools_path, "merge", sample_bam, file_line]) + "\n") os.system(" ".join([samtools_path, "merge", sample_bam, file_line])) print("Sorting bam files for {0} of {1}".format( prefix, bam_data["sample"])) sort_sample = os.path.join(out_folder, "_".join([prefix, bam_data["sample"] + "_sort.bam"])) log.write(" ".join([samtools_path, "sort", "-o", sort_sample, 
                # Tail of a samtools merge/sort helper whose definition starts
                # before this chunk: closes the pending log.write(...) call,
                # then sorts the merged BAM, converts it to SAM, and logs the
                # generated file. NOTE(review): indentation reconstructed —
                # assumed to sit inside the per-sample loop; confirm against
                # the full file.
                              sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                                sort_sample.replace(".bam", ".sam"),
                                sort_sample]) + "\n")
            # SAM output is required downstream by testrealign.x.
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
            log.write("Done!\n")
            log.write("\t" + sort_sample.replace(".bam", ".sam") +
                      " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path, out_folder,
            convert_ones, tmp_reads, remove_ones, prefix, log):
        """Build the per-sample BAM lists to merge, run samtools merge/sort,
        then delete the intermediate converted BAMs and leftover SAMs.

        Three cases: only read files given (derive BAM names from reads),
        both BAMs and reads given (extend the given BAM lists with the
        read-derived BAMs), or only BAMs given (deep-copied as-is).
        """
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    # Strip one archive extension (.gz/.bz2) before deriving
                    # the read prefix from the base filename.
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            # Both sources supplied: append read-derived BAMs to each
            # matching sample, avoiding duplicates.
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        # Clean up intermediates produced earlier in the pipeline.
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        """Run Segemehl's testrealign.x on every *_sort.sam in out_folder to
        detect circular RNA splice sites; results (splicesites/transrealigned
        BED files) go to a per-genome subfolder of self.splice_path.

        The command is run via os.system with stderr redirected to a per-
        genome .log file; all SAM files are removed afterwards.
        """
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does not "
                  "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix + "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        # SAM files are no longer needed once testrealign has run.
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        # For every fasta genome: collect its sequence headers, copy the
        # matching per-header BED files into a per-genome folder, then merge
        # them into one splicesites and one transrealigned BED per sample.
        # Returns (samples, fa_prefixs). NOTE(review): `samples` holds only
        # the samples of the last processed fasta — confirm callers rely on
        # samples being identical across genomes.
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(
                    output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                # Sample id is whatever remains after removing
                                # the header and the splicesites suffix.
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    # Start fresh: remove any previous merged outputs.
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        # For each genome prefix and sample: run detect_circrna on the merged
        # splicesites BED, then convert the resulting candidate table to
        # "all" and "best" GFF files.
        log.write("Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder, prefix))
            self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
            # Copy the merged (non-tmp) BED files next to the splice results.
            for bed in os.listdir(os.path.join(
                    args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file, os.path.join(
                    self.gff_path, prefix + ".gff"), csv_all,
                    args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample, "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        """Parse "sample:file1,file2,..." CLI strings into a list of
        {"sample": ..., "files": [...]} dicts; exits on bad format or
        missing files.
        """
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        """Merge BAMs derived from read files into the per-sample BAM lists.

        If bam_datas is given, append matching read-derived BAMs (present in
        bam_files) to each sample; otherwise build bam_datas entirely from
        the read files. Returns the (possibly new) bam_datas list.
        """
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        """Delete temporary fasta/gff dirs, intermediate BAMs, and the
        per-genome working folders."""
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder, ".bam",
                                       "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        # Pipeline entry point: validate inputs, optionally align raw reads,
        # merge/sort BAMs, run testrealign.x per genome, merge BEDs, do
        # statistics/GFF generation, then clean up.
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            # Raw reads: unzip if needed and align them with segemehl.
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            # BAM-only mode: derive genome prefixes from the parsed fastas.
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    fasta_prefix = fasta.replace(".fa", "")
                    prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path,
                    align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder, convert_ones,
                tmp_reads, remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # Remove the decompressed copies of zipped read files.
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
class Terminator(object):
    '''detection of terminator'''

    def __init__(self, args_term):
        # Parser/converter helpers shared by all steps.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders produced by the multiparser for each input type.
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        # Output layout: GFFs and tables, each split into candidate classes.
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Temporary file/folder names used across the pipeline; several are
        # created in the current working directory.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate the data sections of .ptt/.rnt files into one file
        for TransTermHP.

        Lines are copied only after the "Location" header line. Returns
        "NO_CDS" when a .ptt file is empty (TransTermHP needs CDS info),
        otherwise "Normal".
        """
        with open(combine_file, 'w') as result:
            for file_ in files:
                if (file_.endswith(".ptt")) and (os.stat(file_).st_size == 0):
                    print("Warning: No CDS information, "
                          "TransTermHP can not work!")
                    return "NO_CDS"
                if os.path.exists(file_) and (
                        os.stat(file_).st_size != 0):
                    check_start = False
                    fh = open(file_, 'r')
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                    # Ensure the copied section ends with a newline.
                    if "\n" not in line:
                        result.write("\n")
                    fh.close()
        return "Normal"

    def _make_gff_folder(self):
        """Create the per-class output folders for GFFs and tables."""
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log):
        """Convert each annotation GFF (plus optional sRNA GFF) to the
        .ptt/.rnt files TransTermHP consumes.

        Returns (file_types, prefixs): file_types maps each genome prefix
        to "srna" or "normal" depending on whether sRNA data was merged.
        Exits if the matching fasta cannot be found.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                    fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        log.write("Running converter.py to convert {0} and "
                                  "{1} to {2}, {3}, and {4}.\n".format(
                                      gff_file, srna, ptt_file, rnt_file,
                                      srna.replace(".gff", ".rnt")))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n\t{2}\n".format(
                            ptt_file, rnt_file,
                            srna.replace(".gff", ".rnt")))
                    if (not srna) and (fasta):
                        log.write("Running converter.py to convert {0} "
                                  "to {1}, and {2}.\n".format(
                                      gff_file, ptt_file, rnt_file))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
                else:
                    log.write("Running converter.py to convert {0} "
                              "to {1}, and {2}.\n".format(
                                  gff_file, ptt_file, rnt_file))
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
                    log.write("The following files are generated:\n")
                    log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Combine .ptt/.rnt (and sRNA .rnt, when present) per genome into
        self.combine_path for TransTermHP.

        Returns the last _combine_annotation status ("Normal"/"NO_CDS").
        NOTE(review): `check` is unbound if file_types is empty, and only
        the last genome's status is returned — confirm intended.
        """
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                check = self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                check = self._combine_annotation(combine_file, files)
        return check

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term,
                     log):
        """Invoke TransTermHP once for one genome; stdout goes to `out`."""
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)
        log.write(" ".join([args_term.TransTermHP_path, "-p",
                            args_term.expterm_path, fasta,
                            os.path.join(self.combine_path, file_),
                            "--t2t-perf",
                            os.path.join(out_path, "_".join([
                                prefix,
                                "terminators_within_robust_tail-to-tail_regions.t2t"])),
                            "--bag-output",
                            os.path.join(out_path, "_".join([
                                prefix,
                                "best_terminator_after_gene.bag"]))]) + "\n")

    def _run_TransTermHP(self, args_term, log):
        """Run TransTermHP for every combined .ptt in self.combine_path,
        writing per-genome results under args_term.hp_folder; exits if the
        matching fasta is missing. Removes the combine folder afterwards.
        """
        self.helper.check_make_folder(self.tmps["transterm"])
        log.write("Running TransTermHP.\n")
        log.write("Make sure the version is at least 2.09.\n")
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                    self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa can not be found!.\n".format(prefix))
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                                        "_".join([prefix,
                                                  "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term, log)
                log.write("Done!\n")
                log.write("The following files are generated in {0}.\n".format(
                    out_path))
                # NOTE(review): this inner loop variable shadows the outer
                # `file_`; harmless here since `file_` is not reused after.
                for file_ in os.listdir(out_path):
                    log.write("\t" + file_ + "\n")
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term, log):
        """Convert TransTermHP .bag outputs into GFF3, then combine the GFFs
        per genome via the multiparser."""
        log.write("Running coverter.py to convert the results of TransTermHP "
                  "to gff3 format.\n")
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
                            log.write("\t" + out_file + " is generated.\n")
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Return a folder holding all wiggle files: merges tex and frag
        wigs into a sibling "merge_wigs" folder when both are given,
        otherwise returns whichever folder exists. Exits if neither is set.
        """
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        # When sRNA GFFs are available, merge them with the annotation GFF
        # per genome into self.tmps["merge"] (sorted); otherwise just use
        # the plain annotation path. Returns the folder to read GFFs from.
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path,
                                                    prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort each *_term.gff in place, merge it into the "all" GFF (with
        a fresh ##gff-version header), and build the matching "all" CSV from
        the per-strain raw term tables.
        """
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                    os.path.join(self.csvs["all"], "_".join([
                        prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                # Rebuild the CSV from scratch for this prefix.
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                                         "Strand", "Detect",
                                         "Coverage_decrease",
                                         "Coverage_detail"]) + "\n")
                out_csv.close()
                # Append each strain's raw table once, in the order strains
                # appear in the merged GFF.
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log):
        """Run RNAfold on tmp_seq, writing secondary structures to tmp_sec.

        Runs from inside a scratch folder (so RNAfold's .ps side files land
        there) which is removed afterwards.
        """
        log.write("Computing secondray structures of {0}.\n".format(prefix))
        log.write("Make sure the version of Vienna RNA package is at least 2.3.2.\n")
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        log.write(" ".join([RNAfold_path, "<",
                            os.path.join("..", tmp_seq), ">",
                            os.path.join("..", tmp_sec)]) + "\n")
        os.system(" ".join([RNAfold_path, "<",
                            os.path.join("..", tmp_seq), ">",
                            os.path.join("..", tmp_sec)]))
        log.write("Done!\n")
        log.write("\t" + tmp_sec + " is generated for storing secondary "
                  "structure.\n")
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term, log):
        '''the approach for searching gene converged region terminator'''
        # Per genome: extract intergenic sequences, fold them with RNAfold,
        # pull secondary-structure info, detect poly-T candidates, then
        # confirm them against coverage. Finally combine the GFFs and build
        # the "all" tables.
        log.write("Searching terminators which located in gene converged "
                  "region.\n")
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            # NOTE(review): redundant double assignment kept as in original.
            tmp_cand = tmp_cand = os.path.join(
                args_term.out_folder, "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                log.write("Running get_inter_seq.py to extract the potential "
                          "sequences from {0}.\n".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index,
                               args_term)
                log.write("\t" + tmp_seq + " is generated for storing the "
                          "potential sequences.\n")
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix, log)
                log.write("Running extract_sec_info.py to extract the "
                          "information of secondary structure from {0}.\n".format(
                              prefix))
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                log.write("Running get_polyT.py to detect the "
                          "terminator candidates for {0}.\n".format(prefix))
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
                log.write("\t" + tmp_cand + " which temporary stores terminator "
                          "candidates is generated.\n")
                print("Detecting terminators for " + prefix)
                log.write("Running detect_coverage_term.py to gain "
                          "high-confidence terminators for {0}.\n".format(prefix))
                detect_coverage(
                    tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                    os.path.join(self.tran_path, "_".join([
                        prefix, "transcript.gff"])),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(wig_path, "_".join([prefix,
                                                     "forward.wig"])),
                    os.path.join(wig_path, "_".join([prefix,
                                                     "reverse.wig"])),
                    os.path.join(self.tmps["hp_path"], "_".join([
                        prefix, self.tmps["hp_gff"]])),
                    merge_wigs,
                    os.path.join(self.outfolder["term"], "_".join([
                        prefix, self.suffixs["gff"]])),
                    os.path.join(self.tmps["term_table"], "_".join([
                        prefix, "term_raw.csv"])),
                    args_term)
        self.multiparser.combine_gff(args_term.gffs,
                                     self.outfolder["term"], None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Remove every temporary folder/file the run created (tmp dirs,
        merged wigs, intermediate sequences/structures/candidates)."""
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term, log):
        """Renumber terminator IDs/Names in the "all" GFFs, then run
        stat_term per genome to split candidates into best/expressed/non
        classes, moving the CSVs into their table folders.
        """
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # Zero-padded 5-digit sequential name per file.
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                        entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "_".join(
                        ["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                             entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"],
                            os.path.join(self.terms["all"], "_".join([
                                new_prefix, self.suffixs["gff"]])))
        log.write("Running stat_term.py to do statistics.\n")
        stat_path = os.path.join(args_term.out_folder, "statistics")
        log.write("The following files are generated:\n")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                                   "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                                   "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                                   "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                                   "_".join([prefix, "term"])))
            # stat_term writes CSVs next to the GFFs; relocate them to the
            # matching table folders.
            shutil.move(os.path.join(self.terms["best"],
                                     "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                                     "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                                     "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                                     "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                                     "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                                     "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(self.terms["all"],
                                   "_".join([prefix,
                                             self.suffixs["allgff"]])))
            log.write("\t" + os.path.join(
                self.terms["all"],
                "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(
                self.terms["best"],
                "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(
                self.terms["express"],
                "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(
                self.terms["non"],
                "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(
                self.csvs["all"],
                "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(
                stat_path, "_".join(["stat", prefix + ".csv"])) + "\n")
            log.write("\t" + os.path.join(
                self.csvs["best"],
                "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(
                self.csvs["express"],
                "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(
                self.csvs["non"],
                "_".join([prefix, self.suffixs["csv"]])) + "\n")

    def _check_gff_file(self, folder):
        """Validate attribute uniqueness of every .gff in `folder`."""
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs, log):
        '''searching the associated terminator to transcript'''
        # NOTE(review): the `prefixs` parameter is immediately shadowed and
        # rebuilt from the transcript folder — the argument is unused.
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        log.write("Running compare_tran_term.py for comparing transcripts "
                  "and terminators.\n")
        log.write("The following files are generated:\n")
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta,
                              args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                # Rename the generic stat file to a per-class name so the
                # next iteration does not overwrite it.
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix + ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript",
                                  prefix, type_ + ".csv"])))
                log.write("\t" + os.path.join(
                    args_term.out_folder, "statistics",
                    "_".join(["stat_compare_terminator_transcript",
                              prefix, type_ + ".csv"])) + "\n")

    def _re_table(self, args_term, prefixs, log):
        """Re-generate the coverage-detail column of every candidate table."""
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates",
                      "expressed_candidates", "non_expressed_candidates"]:
            for table in os.listdir(os.path.join(
                    args_term.out_folder, "tables", type_)):
                term_table = os.path.join(args_term.out_folder, "tables",
                                          type_, table)
                reorganize_table(args_term.libs, args_term.merge_wigs,
                                 "Coverage_detail", term_table)
                log.write("\t" + term_table + "\n")

    def run_terminator(self, args_term, log):
        # Pipeline entry point: validate inputs, build ptt/rnt, run
        # TransTermHP, then (if CDS data exists) search converged-region
        # terminators, compute statistics and comparison tables, and
        # finally clean up.
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas, log)
        check = self._combine_ptt_rnt(self.gff_path, file_types,
                                      self.srna_path)
        self._run_TransTermHP(args_term, log)
        self._convert_to_gff(prefixs, args_term, log)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        # Downstream analysis only makes sense when TransTermHP had CDS
        # information to work with.
        if check != "NO_CDS":
            self.multiparser.parser_gff(self.tmps["transterm"],
                                        self.tmps["hp"])
            merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                          self.gff_path)
            self._compute_intersection_forward_reverse(
                prefixs, merge_path, args_term.wig_path,
                args_term.merge_wigs, args_term, log)
            self._compute_stat(args_term, log)
            self._compare_term_tran(args_term, prefixs, log)
            self._re_table(args_term, prefixs, log)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class TSSpredator(object):
    """Drive the TSSpredator Java tool.

    Builds one config file per strain, runs the predictor, converts its
    MasterTable output to gff, then performs the downstream steps
    (orphan check, low-expression filter, manual-TSS merge, validation,
    transcript comparison and statistics).
    """

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # All per-strain MasterTables end up under this folder.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of temporary files/folders used while merging results.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Turn one parsed "wig:tex:cond:rep:strand" entry into a dict."""
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write "<prefix>_<cond><rep> = <wig>" config lines per condition.

        Replicates present in rep_set but missing for a condition get an
        empty assignment (TSSpredator requires every slot to exist).
        """
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): "cond" below is the last item of the previous
            # loop; if cond_sort_list is empty this raises NameError (or
            # reuses a stale value from an earlier condition) -- confirm
            # inputs guarantee at least one library per condition.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(prefix,
                                                      cond["condition"],
                                                      rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Invoke the TSSpredator jar for one strain; capture stdout/err."""
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        """Parse library strings, bind each to its strain wig file, and
        write the annotation/genome/fivePrime lines of the config file.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id).
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        # (typo "Runniun" is in the original message; left untouched here)
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # Substitute the generic wig name with this strain's actual
            # "<name>_STRAIN_<strain>.wig" file.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # Classify by treatment (tex/notex) and strand (+/-):
            # fp/fm = TEX-treated forward/minus, np/nm = untreated.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # The "fivePrime" tracks are the enriched libraries: TEX for TSS
        # mode, notex for processing-site mode.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # "all_<n>" sets a single global minNumRepMatches; otherwise the
        # value is a comma list of "<lib>_<rep>" pairs and the most common
        # rep count becomes the default, with per-library overrides.
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        # TSSpredator's distance is exclusive, hence the +1.
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" tracks use the opposite treatment of the 5' tracks.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        """Convert each strain's MasterTable.tsv into a gff file."""
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                # NOTE(review): "feature" is unbound for any other program
                # value -- presumably upstream validation prevents that;
                # confirm.
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual
        detected TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # Locate the annotation gff matching this strain name.
            # NOTE(review): if no file matches, "gff" keeps the last listing
            # entry -- assumes a matching gff always exists; confirm.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            # The validated annotation replaces the original gff in place.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        """Per-strain class/library statistics and venn diagrams."""
        # (typo "statistaics" is in the original message; left untouched)
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(
                self.gff_outfolder, "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # stat_tsspredator/plot_venn drop PNGs in the CWD; collect them.
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        """Build a config file for every strain that has fasta+gff+wig.

        Returns the list of strain prefixes that got a config.
        """
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    # NOTE(review): "detect" is never reset to False here,
                    # so once any strain has a wig, every later strain is
                    # treated as having one -- confirm whether intended.
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate matching wig tracks into tmp/merge_forward.wig and
        tmp/merge_reverse.wig (strand taken from the library string)."""
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete all temporary folders and merged wig files."""
        # (typo "temperary" is in the original message; left untouched)
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            # Keep both features as-is.
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Replace the gff with the filtered version.
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): if no gff matched above, "out" was never opened and
        # this raises NameError -- confirm at least one matching gff exists.
        out.close()

    def run_tsspredator(self, args_tss):
        """Entry point: prepare inputs, run TSSpredator per strain, and
        run all post-processing steps."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        # From here on "processing_site" is shortened to "processing" in
        # all output file names.
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class CircRNADetection(object):
    '''Detection of circRNA'''
    # Pipeline: align reads with segemehl, convert/merge/sort alignments
    # with samtools, detect splice junctions with testrealign.x, merge the
    # per-replicon bed files and emit circRNA tables + gff files.

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Canonical suffixes of testrealign.x output bed files.
        self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                # Process already terminated; nothing to kill.
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        """Uncompress .bz2/.gz reads into plain fasta files.

        Returns one dict per sample with "files" (all read files) and
        "zips" (only the newly uncompressed copies, for later cleanup).
        """
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            # NOTE(review): this aliases (does not copy) reads["files"], so
            # the appends below also mutate the caller's list -- confirm
            # this mutation is intended.
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    # Force a fasta-like extension if none is recognized.
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        """Build the segemehl index (.idx) for one fasta file."""
        log.write(" ".join([segemehl_path,
                            "-x", os.path.join(fasta_path, index),
                            "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        """Start one segemehl alignment as a background process.

        Returns the Popen handle so callers can wait on it.
        """
        out = open(os.path.join(self.alignment_path,
                                fasta_prefix, sam_file), "w")
        # NOTE(review): this rebinding shadows the "log" parameter -- the
        # command line is written to the per-read .log file, not the main
        # pipeline log; confirm this is the intended destination.
        log = open(os.path.join(self.alignment_path,
                                fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                            "-i", os.path.join(self.fasta_path, index),
                            "-d", os.path.join(self.fasta_path, fasta),
                            "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read. if the bam files are provided,
        it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write("Please make sure the version of segemehl is at least "
                  "0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index,
                                           fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join(
                            [read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join(
                            [read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join(
                            [read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        # Batch the parallel alignments by core count.
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
            self._wait_process(processes)
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            os.path.join(self.alignment_path, fasta_prefix)))
        for file_ in os.listdir(
                os.path.join(self.alignment_path, fasta_prefix)):
            log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        """samtools view -bS: convert one SAM file to BAM."""
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        """Convert every SAM in the folder to BAM; collect pre-existing
        BAMs; delete segemehl .log files.

        Returns (bam_files, convert_ones, remove_ones) where the latter
        two list files to delete after merging.
        """
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools is at least "
                  "1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(
                    sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        """Per sample: merge BAMs, sort, and convert back to SAM for
        testrealign.x."""
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                # Single file: no merge needed, just copy.
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort", "-o",
                                sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                                sort_sample.replace(".bam", ".sam"),
                                sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(self, bam_datas, read_datas,
                                  samtools_path, out_folder, convert_ones,
                                  tmp_reads, remove_ones, prefix, log):
        """Assemble the per-sample BAM lists (from reads, bams or both),
        then merge/sort them and remove intermediate files."""
        if bam_datas is None:
            # Reads only: derive BAM names from the aligned read files.
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            # Both supplied: extend each sample's BAM list with the BAMs
            # produced from its reads (deep copy keeps the caller's data
            # untouched).
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        """Run segemehl's testrealign.x on every sorted SAM to detect
        splice sites (circRNA junction candidates)."""
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl is at least "
                  "0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does "
                  "not exists, please reinstall your Segemehl via using "
                  "make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix +
                                       "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        # testrealign runs per replicon (fasta header); merge the bed files
        # of all replicons of one genome file into a single per-sample bed.
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna")
                    or fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            # Derive the sample tag from the splice file
                            # name (text between header and suffix).
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write("Running circRNA.py to do statistics and generate gff "
                  "files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(
                os.path.join(self.gff_folder, prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            # Move the merged (non-tmp) bed files into the splice folder.
            for bed in os.listdir(os.path.join(
                    args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file, os.path.join(
                    self.gff_path, prefix + ".gff"), csv_all,
                    args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample,
                                          "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        """Parse "<sample>:<file1>,<file2>,..." CLI strings into dicts.

        Exits with an error message on malformed entries or missing files.
        """
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        """Merge read-derived BAM paths into the per-sample BAM lists
        (building the lists from scratch when only reads were given)."""
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(self.alignment_path,
                                                  prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        """Delete tmp dirs, leftover BAMs and per-genome work folders."""
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be both "
                      "emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            # BAM-only mode: skip alignment, derive prefixes from fastas.
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    fasta_prefix = fasta.replace(".fa", "")
                    prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder,
                convert_ones, tmp_reads, remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # Remove only the files we uncompressed ourselves.
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
class RATT(object):
    """Transfer annotations from reference genomes to a target genome
    with RATT and post-process the results into gff/ptt/rnt files.

    NOTE(review): a second, extended ``RATT`` class (with logging support)
    is defined later in this file and shadows this one at import time —
    confirm which definition is intended to survive.
    """

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # Scratch folders for the gbk -> embl conversion, created under
        # the reference embl folder.
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Temporary files/folders used while merging RATT's output.
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Generate a .ptt and a .rnt file next to every .gff in files."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                # The conversion needs the matching genome fasta; the gff
                # is skipped silently when no fasta is found.
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Move the merged outputs into place and delete temporaries."""
        # Drop the per-record gff/ptt/rnt copies; the merged versions in
        # tmp_files["out_gff"] become the final outputs.
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff",
                                       "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt",
                                       "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt",
                                       "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder",
                                       "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder",
                                       "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT embl result to gff and collect its name."""
        # RATT result names look like "<tag>.<strain...>.final.embl";
        # drop the first and the last two dot-separated fields.
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record GenBank files into one .gbk per record."""
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        # New record: its LOCUS id names the output file.
                        # NOTE(review): out/filename stay unbound when a
                        # file does not begin with a LOCUS line — confirm
                        # inputs are well-formed GenBank.
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        # NOTE(review): str.find returns -1 (truthy) when
                        # absent and 0 (falsy) for a prefix match, so the
                        # VERSION-based name wins unless the LOCUS name is
                        # a prefix of it — confirm this is intended.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        # Record terminator: close and move into place.
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        """Collect .gbk references and convert them to embl format."""
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if embl.endswith(".gbk"):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Invoke the RATT executable for one ref:tar genome pair."""
        # RATT's stdout goes to the log handle; stderr is discarded.
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every ref:tar pair and tidy its scratch files."""
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            print(tar)
            self._run_ratt(args_ratt, tar, ref, out)
            # RATT writes into the current working directory: keep the
            # "final" results, remove the other scratch files/folders.
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path,
                                             filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
            out.close()

    def annotation_transfer(self, args_ratt):
        """Entry point: run RATT for all pairs and post-process outputs."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            # Merge the per-record gff/ptt/rnt files back into one file per
            # input genome (input genomes were split into "_folder" dirs).
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = datas[0][:-3]
                    for file_ in os.listdir(os.path.join(
                            args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                    args_ratt.gff_outfolder, gff),
                                    self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                    args_ratt.gff_outfolder, gff),
                                    self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                    args_ratt.gff_outfolder, gff),
                                    self.tmp_files["rnt"])
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
        self._remove_files(args_ratt, out_gbk)
class RATT(object):
    '''annotation transfer

    Runs RATT for every reference:target genome pair and converts the
    resulting embl files into gff/ptt/rnt format. References can be
    supplied either as GenBank files (converted to embl first) or
    directly as embl files.
    '''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # GenBank references are converted to embl under gbk_tmp/embls;
        # embl references are used as-is and take precedence.
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Temporary files/folders used while merging RATT's output.
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        """Generate a .ptt and a .rnt file next to every .gff in files."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                # The conversion needs the matching genome fasta; the gff
                # is skipped silently when no fasta is found.
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        """Move the merged outputs into place and delete temporaries.

        NOTE(review): out_gbk is accepted for interface compatibility but
        is no longer removed here (remove_tmp_dir on ref_gbk covers it) —
        confirm.
        """
        # Drop the per-record gff/ptt/rnt copies; the merged versions in
        # tmp_files["out_gff"] become the final outputs.
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff",
                                       "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt",
                                       "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt",
                                       "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        # FIX: corrected "temperary" -> "temporary" in the log message.
        log.write("Remove the temporary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        """Convert one RATT embl result to gff and collect its name."""
        # RATT result names look like "<tag>.<strain...>.final.embl";
        # drop the first and the last two dot-separated fields.
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record GenBank files into one .gbk per record."""
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            # FIX: initialize the handle so a malformed file that never
            # starts with "LOCUS" cannot raise NameError below.
            out = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        # New record: its LOCUS id names the output file.
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(),
                                                         "gbk"])
                                break
                        # NOTE(review): str.find returns -1 (truthy) when
                        # absent and 0 (falsy) for a prefix match, so the
                        # VERSION-based name wins unless the LOCUS name is
                        # a prefix of it — confirm this is intended.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        # Record terminator: close and move into place.
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close and out is not None:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (
                    embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            # FIX: corrected "but not gbk files" -> "but no gbk files".
            log.write("--related_gbk_files is assigned, but no gbk files "
                      "are detected.\n"
                      "The gbk file names need to be ended at "
                      ".gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write("Running converter.py to convert gbk file "
                      "to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl + " is generated and the embl "
                      "files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        """Invoke the RATT executable for one ref:tar genome pair."""
        # Fail fast when the embl folder or either genome fasta is
        # missing — usually a strain-name mismatch in --compare_pair.
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the "
                      "same as the strain names in fasta, genbank, or "
                      "embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        # Record the exact command line, then run it; RATT's stdout goes
        # to the log handle and stderr is discarded.
        log.write(" ".join([args_ratt.ratt_path, self.embl,
                            os.path.join(self.tmp_files["tar"],
                                         tar + ".fa"),
                            args_ratt.element, args_ratt.transfer_type,
                            os.path.join(self.tmp_files["ref"],
                                         ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        """Run RATT for every ref:tar pair and tidy its scratch files."""
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # FIX: use a context manager so the ratt_log handle is closed
            # for every pair (the original leaked one handle per pair).
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
                # FIX: corrected "generatd" -> "generated" in the log.
                log.write("The following files are generated:\n")
                # RATT writes into the current working directory: keep
                # the "final" results, remove the other scratch files.
                for filename in os.listdir():
                    if ("final" in filename):
                        log.write("\t" + filename + "\n")
                        shutil.move(filename,
                                    os.path.join(args_ratt.output_path,
                                                 filename))
                    elif (args_ratt.element in filename) or (
                            "query" in filename) or (
                            "Reference" in filename) or (
                            "Query" in filename) or (
                            "Sequences" in filename):
                        log.write("\t" + filename + "\n")
                        if os.path.isfile(filename):
                            os.remove(filename)
                        if os.path.isdir(filename):
                            shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        """Entry point: run RATT for all pairs and post-process outputs."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        # GenBank references only need conversion when no embl references
        # were given.
        if args_ratt.ref_embls is None:
            # BUG FIX: was args_ratt.ref_gbki (AttributeError at runtime);
            # the attribute used everywhere else is ref_gbk.
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt "
                          "format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        # NOTE(review): "data" here is the last value bound by the loop
        # above and raises NameError when output_path is empty — confirm
        # the intended message scope.
        log.write("Merging the output of {0}.\n".format(data))
        # Merge the per-record gff/ptt/rnt files back into one file per
        # input genome (input genomes were split into "_folder" dirs).
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                # Nothing merged means RATT produced no usable transfer
                # for this genome — report instead of crashing on move.
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT "
                              "can work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)