class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix,
                      log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle file names do not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assign tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check the replicate match setting'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file), "r")
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines) - 1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be because \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be because \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines) - 1].split("\t")[1].split("_")
                    detect_all = all(elem in para_info for elem in [
                        "he", "rh", "fa", "rf", "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        print("Error: {0} is incomplete. Some parameters "
                              "are missing!".format(best_file))
                        log.write("Error: {0} is incomplete. Some parameters "
                                  "are missing!\n".format(best_file))
                        sys.exit()
                    else:
                        detect = True
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return (height, height_reduction, factor, factor_reduction,
                    base_height, enrichment_factor, processing_factor)
        else:
            print("Error: No best_{0}.csv can be found in {1}!".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not "
                      "assigned!".format(prefix))
                log.write("Error: the parameters for {0} were not "
                          "assigned!\n".format(prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return (height, height_reduction, factor, factor_reduction,
                base_height, enrichment_factor, processing_factor)

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file, log):
        '''generation of config files'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            (height, height_reduction, factor, factor_reduction,
             base_height, enrichment_factor, processing_factor) = \
                self._extract_best_para(args_tss, project_strain_name, log)
        else:
            (height, height_reduction, factor, factor_reduction,
             base_height, enrichment_factor, processing_factor) = \
                self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = self._import_lib(
            args_tss.libs, wig_folder, project_strain_name, out, gff,
            args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: No MasterTable file is found in {0}".format(
                    out_path))
                print("Please check the configuration file.")
                log.write("No MasterTable file is found in {0}\n".format(
                    out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + " is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manually detected TSSs are provided, merge them with the
        TSSs predicted by TSSpredator'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classifying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff),
                    args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSSs against the genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSSs with transcripts'''
        detect = False
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                os.path.join(self.stat_outfolder, tss)))
            for file_ in os.listdir(os.path.join(self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        detect = False  # reset for the next genome
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                                   self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if the genome annotation has no locus tags, the coverage data
        can still be used to classify the TSSs'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Removing temporary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''handle TSSs and processing sites located at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''filter out the low-expressed TSSs'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"),
                self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(input_folder,
                                       "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("Checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low-expressed TSSs/PSs.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                           None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlapping "
                  "positions between TSSs and PSs.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
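# Hedged illustration of the two text formats handled by the class above.
# Everything in this block is invented for demonstration and is safe to run
# at import time (it touches only literals).
#
# (1) _import_lib() consumes library strings of the form
#     "wig_file:tex|notex:condition:replicate:strand":
_example_lib = "TSB_OD_0.2_TEX_forward.wig:tex:1:a:+"
_wig, _tex, _cond, _rep, _strand = _example_lib.split(":")
assert _wig.endswith(".wig") and _tex in ("tex", "notex")
assert int(_cond) >= 1 and _strand in ("+", "-")
#
# (2) _gen_config() writes a TSSpredator "key = value" config; a fragment of
#     the emitted text might look like this (parameter values invented):
_example_config = ("TSSinClusterSelectionMethod = HIGHEST\n"
                   "allowedCompareShift = 1\n"
                   "minCliffHeight = 0.3\n"
                   "minNormalHeight = 0.0\n"
                   "mode = cond\n")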
class Ribos(object):

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        self.tmp_files = {"fasta": os.path.join(args_ribo.out_folder,
                                                "tmp_fasta"),
                          "scan": os.path.join(args_ribo.out_folder,
                                               "tmp_scan"),
                          "table": os.path.join(args_ribo.out_folder,
                                                "tmp_table")}
        self.suffixs = {"csv": "riboswitch.csv",
                        "txt": "riboswitch_prescan.txt",
                        "re_txt": "riboswitch_scan.txt",
                        "re_csv": "riboswitch_scan.csv"}

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        scan = open(scan_file, "w")
        call([os.path.join(args_ribo.infernal_path, "cmscan"),
              "--incE", str(args_ribo.e_value), "--acc",
              self.ribos_rfam, seq], stdout=scan)
        scan.close()
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of riboswitch candidates of "
                      "{0}".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path, prefix + "_transcript.gff"),
                    first_seq, args_ribo)
                print("Pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo, first_seq,
                                                     "txt", prefix)
                sec_seq = os.path.join(self.tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                print("Scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    self.scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(
                                os.path.join(
                                    self.tmp_files["table"],
                                    "_".join([entry.seq_id,
                                              self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix, self.suffixs["csv"]])))
                        else:
                            self.helper.merge_file(
                                os.path.join(
                                    self.tmp_files["table"],
                                    "_".join([entry.seq_id,
                                              self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix, self.suffixs["csv"]])))
                        shutil.copy(
                            os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          self.suffixs["txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        shutil.copy(
                            os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          self.suffixs["re_txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                    self.stat_folder,
                    "_".join(["stat", prefix, "riboswitch.txt"]))
                print("Computing statistics of {0}".format(prefix))
                stat_and_covert2gff(
                    os.path.join(self.table_folder,
                                 "_".join([prefix, self.suffixs["csv"]])),
                    args_ribo.ribos_id,
                    os.path.join(self.gff_outfolder,
                                 "_".join([prefix, "riboswitch.gff"])),
                    args_ribo.fuzzy, out_stat)
                fh.close()

    def _remove_tmp(self, args_ribo):
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(
                        self.tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  self.suffixs["csv"]])),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam)
        print("Compressing the Rfam riboswitch models with cmpress...")
        call([os.path.join(args_ribo.infernal_path, "cmpress"),
              "-F", self.ribos_rfam])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)
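# Hedged sketch of the Infernal invocation wrapped by Ribos._run_infernal
# above. The flags (--incE, --acc) come straight from that method; the
# cmscan binary is assumed to be on PATH and the file names are invented.
# Defined but never called at import time.
def _example_cmscan(cm_db="Rfam_riboswitch.cm", seq="candidates.fa",
                    out_txt="prescan.txt", e_value=0.001):
    from subprocess import call  # local import keeps the sketch standalone
    with open(out_txt, "w") as scan:
        call(["cmscan", "--incE", str(e_value), "--acc", cm_db, seq],
             stdout=scan)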
class MEME(object):
    '''detection of promoters'''

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder,
                                               "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename, fasta,
                          width, args_pro):
        '''run MEME or GLAM2 with a specific motif width'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder,
                                                     "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder,
                                                     "GLAM2")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder),
                  "-w", str(width), "-b", str(width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename, fasta,
                         width, args_pro):
        '''run MEME or GLAM2 with a range of motif widths'''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder,
                                                     "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100", "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            # "elif" here would skip GLAM2 when the program is "both";
            # use "if" to match _run_normal_motif.
            glam_folder = self._gen_and_check_folder(out_path, folder,
                                                     "GLAM2")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder),
                  "-a", str(min_width), "-b", str(max_width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and (
                    prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and (
                    prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and (
                    prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"],
                    "_".join([out_prefix, "allstrain_primary.fa"]))
        shutil.move(self.fastas["sec"],
                    "_".join([out_prefix, "allstrain_secondary.fa"]))
        shutil.move(self.fastas["inter"],
                    "_".join([out_prefix, "allstrain_internal.fa"]))
        shutil.move(self.fastas["anti"],
                    "_".join([out_prefix, "allstrain_antisense.fa"]))
        shutil.move(self.fastas["orph"],
                    "_".join([out_prefix, "allstrain_orphan.fa"]))
        shutil.move(all_type,
                    "_".join([out_prefix, "allstrain_all_types.fa"]))
        shutil.move(all_no_orph,
                    "_".join([out_prefix, "allstrain_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allstrain")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    os.remove(os.path.join(
                        input_path,
                        "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro):
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                        fasta, width))
                    if "-" in width:
                        self._run_small_motif(input_path, out_path,
                                              filename, fasta, width,
                                              args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path,
                                               filename, fasta, width,
                                               args_pro)

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS files in the input folder to generate the
        global TSSs for detecting the global promoters'''
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(
                        os.path.join(self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(
                        os.path.join(args_pro.fastas, fasta),
                        self.all_fasta)
        else:
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_class")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(
                        os.path.join(self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(
                        os.path.join(args_pro.fastas, fasta),
                        self.all_fasta)
        print("Generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program):
        '''generate the promoter table'''
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (program.lower() == "meme"):
                for folder in os.listdir(os.path.join(
                        output_folder, strain, "MEME")):
                    gen_promoter_table(
                        os.path.join(output_folder, strain, "MEME",
                                     folder, "meme.txt"),
                        os.path.join(output_folder, strain, "MEME",
                                     folder, "meme.csv"),
                        tss_file, "meme")
            if (program.lower() == "both") or (program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(
                        output_folder, strain, "GLAM2")):
                    gen_promoter_table(
                        os.path.join(output_folder, strain, "GLAM2",
                                     folder, "glam2.txt"),
                        os.path.join(output_folder, strain, "GLAM2",
                                     folder, "glam2.csv"),
                        tss_file, "glam2")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get the upstream sequences of the TSSs'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            if (args_pro.gffs is None) or (args_pro.tex_wigs is None) or (
                    args_pro.input_libs is None):
                print("Error: Please assign proper annotation, tex +/- "
                      "wig folder and tex treated libs!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
            print("Classifying TSSs and extracting fasta of {0}".format(
                prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_class",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def run_meme(self, args_pro):
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("Generating the table")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program)
        self._remove_files(args_pro)
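# Hedged sketch of the motif-discovery commands assembled in
# MEME._run_normal_motif above: the flags mirror that method; the meme and
# glam2 binaries are assumed to be on PATH and the file/folder names are
# invented. Defined but never called at import time.
def _example_motif_commands(fasta="primary.fa", out_dir="motifs",
                            width=50, num_motif=3, e_value=0.05):
    from subprocess import call  # local import keeps the sketch standalone
    call(["meme", "-maxsize", "1000000", "-dna",
          "-nmotifs", str(num_motif), "-w", str(width),
          "-maxiter", "100", "-evt", str(e_value),
          "-oc", out_dir, fasta])
    # GLAM2: "-w"/"-b" pin the aligned width, "-r" sets the number of
    # runs, and the trailing "n" selects the nucleotide alphabet.
    call(["glam2", "-O", out_dir, "-w", str(width), "-b", str(width),
          "-r", str(num_motif), "-n", "2000", "n", fasta])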
class sORFDetection(object):
    '''detection of sORFs'''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
                (args_sorf.tex_wigs is None) and (
                args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("Genome annotation, transcript files or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if args_sorf.tsss is None:
                print("Error: TSS files are required for UTR-derived "
                      "sORF detection!")
                log.write("TSS files are required for UTR-derived "
                          "sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect sORFs based on start/stop codons and ribosome
        binding sites'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(
                os.path.join(self.fasta_path, prefix + ".fa"), srna_file,
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])),
                tss_file,
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "forward.wig"])),
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "reverse.wig"])),
                os.path.join(self.gff_output, self.all_cand,
                             "_".join([prefix, "sORF"])),
                args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_all.gff"])),
                    gff_all)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_best.gff"])),
                    gff_best)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_all.csv"])),
                    csv_all)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_best.csv"])),
                    csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcripts and CDSs to find the intergenic regions'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs.\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(
                    prefix))
                get_intergenic(
                    os.path.join(args_sorf.gffs, gff),
                    os.path.join(self.tran_path,
                                 "_".join([prefix, "transcript.gff"])),
                    os.path.join(args_sorf.out_folder,
                                 "_".join([prefix, "inter.gff"])),
                    args_sorf.utr_detect, args_sorf.hypo,
                    args_sorf.extend_5, args_sorf.extend_3)
                log.write("\t" + os.path.join(
                    args_sorf.out_folder, "_".join([prefix, "inter.gff"])) +
                    " is generated to temporarily store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        log.write("Running re_table.py for generating coverage "
                  "information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_,
                                          "_".join([prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path,
                                       None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output,
                                            self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat_file = os.path.join(
                    args_sorf.out_folder, "statistics",
                    "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     stat_file, args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)
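# Hedged usage sketch: `args_sorf` is the argparse-style namespace built by
# ANNOgesic's controller (not shown in this module); only the attributes
# that the class above actually reads are meaningful. The log path is
# invented. Defined but never called at import time.
def _example_run_sorf(args_sorf, log_path="sORF.log"):
    detector = sORFDetection(args_sorf)
    with open(log_path, "w") as log:
        detector.run_sorf_detection(args_sorf, log)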
class UTRDetection(object): def __init__(self, args_utr): self.helper = Helper() self.multiparser = Multiparser() self.tss_path = os.path.join(args_utr.tsss, "tmp") self.tran_path = os.path.join(args_utr.trans, "tmp") self.utr5_path = os.path.join(args_utr.out_folder, "5UTR") self.utr3_path = os.path.join(args_utr.out_folder, "3UTR") self.utr5_stat_path = os.path.join(self.utr5_path, "statistics") self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")
def _check_folder(self, folder): if folder is None: print("Error: Required files are missing!") sys.exit()
def _check_gff(self, folder): for gff in os.listdir(folder): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, gff))
def _compute_utr(self, args_utr): for gff in os.listdir(args_utr.gffs): if gff.endswith(".gff"): prefix = gff[:-4] tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) tran = self.helper.get_correct_file( self.tran_path, "_transcript.gff", prefix, None, None) if args_utr.terms: term = self.helper.get_correct_file( os.path.join(args_utr.terms, "tmp"), "_term.gff", prefix, None, None) else: term = None print("Computing 5'UTRs of {0}".format(prefix)) detect_5utr(tss, os.path.join(args_utr.gffs, gff), tran, os.path.join(self.utr5_path, "gffs", "_".join([prefix, "5UTR.gff"])), args_utr) print("Computing 3'UTRs of {0}".format(prefix)) detect_3utr(tran, os.path.join(args_utr.gffs, gff), term, os.path.join(self.utr3_path, "gffs", "_".join([prefix, "3UTR.gff"])), args_utr) self.helper.move_all_content( os.getcwd(), self.utr5_stat_path, ["_5utr_length.png"]) self.helper.move_all_content( os.getcwd(), self.utr3_stat_path, ["_3utr_length.png"])
def run_utr_detection(self, args_utr): self._check_folder(args_utr.tsss) self._check_folder(args_utr.gffs) self._check_folder(args_utr.trans) self._check_gff(args_utr.tsss) self._check_gff(args_utr.gffs) self._check_gff(args_utr.trans) if args_utr.terms is not None: self._check_gff(args_utr.terms) self.multiparser.parser_gff(args_utr.gffs, None) self.multiparser.parser_gff(args_utr.tsss, "TSS") self.multiparser.combine_gff(args_utr.gffs, self.tss_path, None, "TSS") self.multiparser.parser_gff(args_utr.trans, "transcript") self.multiparser.combine_gff(args_utr.gffs, self.tran_path, None, "transcript") if args_utr.terms: self.multiparser.parser_gff(args_utr.terms, "term") self.multiparser.combine_gff(args_utr.gffs, os.path.join(args_utr.terms, "tmp"), None, "term") self._compute_utr(args_utr) self.helper.remove_tmp(args_utr.gffs) self.helper.remove_tmp(args_utr.tsss) self.helper.remove_tmp(args_utr.trans) self.helper.remove_tmp(args_utr.terms) self.helper.remove_tmp(self.utr5_path) self.helper.remove_tmp(self.utr3_path)
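# Simplified sketch of the 5'UTR notion used by detect_5utr above: on the
# forward strand the 5'UTR runs from a TSS to the base before the start of the
# first downstream CDS. The real detection also checks transcript boundaries
# and fuzziness; coordinates here are assumed to be 1-based. Not called by the
# pipeline.
def _example_5utr_span(tss_pos, cds_start):
    """Return (start, end) of a forward-strand 5'UTR, or None if empty."""
    if tss_pos < cds_start:
        return (tss_pos, cds_start - 1)
    return None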
class Terminator(object): def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables")} self.terms = {"all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "express"), "best": os.path.join(self.outfolder["term"], "best"), "non": os.path.join(self.outfolder["term"], "non_express")} self.csvs = {"all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "express"), "best": os.path.join(self.outfolder["csv"], "best"), "non": os.path.join(self.outfolder["csv"], "non_express")} self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp")} self.suffixs = {"gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff"} if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: check_start = False fh = open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file( fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: no proper file - {0}.fa".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" if (not srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" else: self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" return file_types, prefixs def _combine_ptt_rnt(self, gff_path, file_types, srna_path): self.helper.check_make_folder(self.combine_path) for prefix, file_type in 
file_types.items(): combine_file = os.path.join(self.combine_path, prefix + '.ptt') if file_type == "normal": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt")] self._combine_annotation(combine_file, files) elif file_type == "srna": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt"), os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))] self._combine_annotation(combine_file, files) def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term): call([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))], stdout=out) def _run_TransTermHP(self, args_term): self.helper.check_make_folder(self.tmps["transterm"]) for file_ in os.listdir(self.combine_path): if ".ptt" in file_: prefix = file_.replace(".ptt", "") fasta = self.helper.get_correct_file( self.fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: no proper file - {0}.fa".format(prefix)) sys.exit() out_path = os.path.join(args_term.hp_folder, prefix) self.helper.check_make_folder(out_path) out = open(os.path.join(out_path, "_".join([prefix, "terminators.txt"])), "w") self._TransTermHP(fasta, file_, out_path, prefix, out, args_term) out.close() shutil.rmtree(self.combine_path) def _convert_to_gff(self, prefixs, args_term): for prefix in prefixs: for folder in os.listdir(args_term.hp_folder): if prefix == folder: out_path = os.path.join(args_term.hp_folder, folder) for file_ in os.listdir(out_path): if file_.endswith(".bag"): out_file = os.path.join( self.tmps["transterm"], "_".join([prefix, self.tmps["hp_gff"]])) self.converter.convert_transtermhp2gff( os.path.join(out_path, file_), out_file) self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"], None, self.tmps["hp"]) def _combine_wigs(self, args_term): if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): folder = args_term.tex_wigs.split("/") folder = "/".join(folder[:-1]) merge_wigs = os.path.join(folder, "merge_wigs") self.helper.check_make_folder(merge_wigs) for wig in os.listdir(args_term.tex_wigs): if os.path.isdir(os.path.join(args_term.tex_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.tex_wigs, wig), merge_wigs) for wig in os.listdir(args_term.frag_wigs): if os.path.isdir(os.path.join(args_term.frag_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.frag_wigs, wig), merge_wigs) elif (args_term.tex_wigs is not None): merge_wigs = args_term.tex_wigs elif (args_term.frag_wigs is not None): merge_wigs = args_term.frag_wigs else: print("Error: no proper wig files!!!") sys.exit() return merge_wigs def _merge_sRNA(self, sRNAs, prefixs, gff_path): if sRNAs is not None: self.multiparser.parser_gff(sRNAs, "sRNA") self.helper.check_make_folder(self.tmps["merge"]) for prefix in prefixs: tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"]) if self.tmps["gff"] in os.listdir(self.tmps["merge"]): os.remove(tmp_gff) self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"), tmp_gff) self.helper.merge_file(os.path.join( self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff) self.helper.sort_gff(tmp_gff, os.path.join( self.tmps["merge"], prefix + ".gff")) os.remove(tmp_gff) merge_path = self.tmps["merge"] else: merge_path = gff_path return merge_path def _move_file(self, 
term_outfolder, csv_outfolder): for gff in os.listdir(term_outfolder): if gff.endswith("_term.gff"): self.helper.sort_gff(os.path.join(term_outfolder, gff), self.tmps["gff"]) shutil.move(self.tmps["gff"], os.path.join(term_outfolder, gff)) prefix = gff.replace("_term.gff", "") new_gff = os.path.join(self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]])) csv_file = os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]])) out = open(new_gff, "w") out.write("##gff-version 3\n") out.close() self.helper.merge_file( os.path.join(term_outfolder, gff), os.path.join( self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]]))) os.remove(os.path.join(term_outfolder, gff)) pre_strain = "" if ("_".join([prefix, self.suffixs["csv"]]) in os.listdir(self.csvs["all"])): os.remove(csv_file) out_csv = open(csv_file, "w") out_csv.write("\t".join(["strain", "name", "start", "end", "strand", "detect", "coverage_detail"]) + "\n") out_csv.close() fh = open(new_gff) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: self.helper.merge_file(os.path.join( self.tmps["term_table"], "_".join([ entry.seq_id, "term_raw.csv"])), os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) pre_strain = entry.seq_id fh.close()
def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix): print("Computing secondary structures of {0}".format(prefix)) self.helper.check_make_folder(self.tmps["folder"]) pre_cwd = os.getcwd() os.chdir(self.tmps["folder"]) os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)])) os.chdir(pre_cwd) shutil.rmtree(self.tmps["folder"])
def _compute_intersection_forward_reverse( self, prefixs, merge_path, wig_path, merge_wigs, args_term): for prefix in prefixs: tmp_seq = os.path.join(args_term.out_folder, "_".join(["inter_seq", prefix])) tmp_sec = os.path.join(args_term.out_folder, "_".join(["inter_sec", prefix])) tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) gff_file = os.path.join(merge_path, prefix + ".gff") print("Extracting sequences of {0}".format(prefix)) intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"), tran_file, gff_file, tmp_seq) self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix) tmp_cand = os.path.join(args_term.out_folder, "_".join(["term_candidates", prefix])) poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term) print("Detecting terminators of {0}".format(prefix)) detect_coverage( tmp_cand, os.path.join(merge_path, prefix + ".gff"), os.path.join(self.tran_path, "_".join([ prefix, "transcript.gff"])), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(wig_path, "_".join([prefix, "forward.wig"])), os.path.join(wig_path, "_".join([prefix, "reverse.wig"])), os.path.join(self.tmps["hp_path"], "_".join([ prefix, self.tmps["hp_gff"]])), merge_wigs, os.path.join(self.outfolder["term"], "_".join([ prefix, self.suffixs["gff"]])), os.path.join(self.tmps["term_table"], "_".join([ prefix, "term_raw.csv"])), args_term) self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"], None, "term") self._move_file(self.outfolder["term"], self.outfolder["csv"])
def _remove_tmp_file(self, merge_wigs, args_term): self.helper.remove_tmp(args_term.gffs) self.helper.remove_tmp(args_term.fastas) if args_term.srnas is not None: self.helper.remove_tmp(args_term.srnas) shutil.rmtree(self.tmps["merge"]) if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): shutil.rmtree(merge_wigs)
self.helper.remove_tmp(args_term.trans) self.helper.remove_tmp(args_term.tex_wigs) self.helper.remove_tmp(args_term.frag_wigs) self.helper.remove_tmp(self.outfolder["term"]) shutil.rmtree(self.tmps["transterm"]) shutil.rmtree(self.tmps["term_table"]) self.helper.remove_all_content(args_term.out_folder, "inter_seq_", "file") self.helper.remove_all_content(args_term.out_folder, "inter_sec_", "file") self.helper.remove_all_content(args_term.out_folder, "term_candidates_", "file") def _compute_stat(self, args_term): new_prefixs = [] for gff in os.listdir(self.terms["all"]): if gff.endswith("_term_all.gff"): out_tmp = open(self.tmps["gff"], "w") out_tmp.write("##gff-version 3\n") new_prefix = gff.replace("_term_all.gff", "") new_prefixs.append(gff.replace("_term_all.gff", "")) num = 0 fh = open(os.path.join(self.terms["all"], gff)) for entry in self.gff_parser.entries(fh): name = '%0*d' % (5, num) entry.attributes["ID"] = "term" + str(num) entry.attributes["Name"] = "_".join(["Terminator_" + name]) entry.attribute_string = ";".join([ "=".join(items) for items in entry.attributes.items()]) out_tmp.write("\t".join([entry.info_without_attributes, entry.attribute_string]) + "\n") num += 1 out_tmp.close() fh.close() shutil.move(self.tmps["gff"], os.path.join(self.terms["all"], "_".join([new_prefix, self.suffixs["gff"]]))) if args_term.stat: stat_path = os.path.join(args_term.out_folder, "statistics") for prefix in new_prefixs: stat_term(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])), os.path.join(self.terms["best"], "_".join([prefix, "term"])), os.path.join(self.terms["express"], "_".join([prefix, "term"])), os.path.join(self.terms["non"], "_".join([prefix, "term"]))) shutil.move(os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]]))) os.remove(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) def _check_gff_file(self, folder): for file_ in os.listdir(folder): if file_.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, file_)) def _compare_term_tran(self, args_term): self.multiparser.combine_gff(args_term.gffs, self.tran_path, None, "transcript") for type_ in ("best", "express", "all_candidates"): compare_term_tran(self.tran_path, os.path.join(self.outfolder["term"], type_), args_term.fuzzy_up_ta, args_term.fuzzy_down_ta, args_term.out_folder, "terminator") shutil.move( os.path.join( args_term.out_folder, "statistics", "stat_comparison_terminator_transcript.csv"), os.path.join( args_term.out_folder, "statistics", "stat_comparison_terminator_transcript_" + type_ + ".csv")) def run_terminator(self, args_term): self._check_gff_file(args_term.gffs) self._check_gff_file(args_term.trans) self.multiparser.parser_fasta(args_term.fastas) if (not args_term.gffs) or (not args_term.fastas): print("Error: please assign gff annotation folder " "and fasta folder!!!") sys.exit() file_types, prefixs = self._convert_gff2rntptt( self.gff_path, self.fasta_path, args_term.srnas) 
self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path) self._run_TransTermHP(args_term) self._convert_to_gff(prefixs, args_term) self.helper.remove_tmp(self.gff_path) self.multiparser.parser_gff(args_term.trans, "transcript") self.helper.check_make_folder(self.tmps["term_table"]) self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"]) merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path) self._compute_intersection_forward_reverse( prefixs, merge_path, args_term.wig_path, args_term.merge_wigs, args_term) self._compute_stat(args_term) self._compare_term_tran(args_term) self._remove_tmp_file(args_term.merge_wigs, args_term)
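# _run_rnafold above shells out via os.system as "RNAfold < seq > sec". The
# subprocess-based equivalent below avoids the shell; it is a sketch only and
# is not called by the pipeline (it reuses `call` from subprocess, which this
# module already uses elsewhere).
def _example_rnafold(rnafold_path, seq_file, sec_file):
    """Fold the sequences in seq_file, writing dot-bracket output to sec_file."""
    with open(seq_file) as seq_in, open(sec_file, "w") as sec_out:
        call([rnafold_path], stdin=seq_in, stdout=sec_out)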
class OperonDetection(object): '''detection of operon''' def __init__(self, args_op): self.multiparser = Multiparser() self.helper = Helper() if args_op.tsss is not None: self.tss_path = os.path.join(args_op.tsss, "tmp") else: self.tss_path = None self.tran_path = os.path.join(args_op.trans, "tmp") self.table_path = os.path.join(args_op.output_folder, "tables") if args_op.terms is not None: self._check_gff(args_op.terms, "term") self.term_path = os.path.join(args_op.terms, "tmp") else: self.term_path = None
def _check_gff(self, gffs, type_): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff))
def _detect_operon(self, prefixs, args_op, log): log.write("Running detect_operon.py to detect operons.\n") log.write("The following files are generated:\n") for prefix in prefixs: out_gff = os.path.join(args_op.output_folder, "gffs", "_".join([prefix, "operon.gff"])) out_table = os.path.join(self.table_path, "_".join([prefix, "operon.csv"])) print("Detecting operons of {0}".format(prefix)) if self.tss_path is None: tss = False else: tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff", prefix, None, None) tran = self.helper.get_correct_file(self.tran_path, "_transcript.gff", prefix, None, None) gff = self.helper.get_correct_file(args_op.gffs, ".gff", prefix, None, None) if self.term_path is None: term = False else: term = self.helper.get_correct_file(self.term_path, "_term.gff", prefix, None, None) operon(tran, tss, gff, term, args_op.tss_fuzzy, args_op.term_fuzzy, args_op.length, out_table, out_gff) log.write("\t" + out_table + "\n") log.write("\t" + out_gff + "\n")
def _check_and_parser_gff(self, args_op): self._check_gff(args_op.gffs, "gff") self._check_gff(args_op.trans, "tran") self.multiparser.parser_gff(args_op.gffs, None) self.multiparser.parser_gff(args_op.trans, "transcript") self.multiparser.combine_gff(args_op.gffs, self.tran_path, None, "transcript") if args_op.tsss is not None: self._check_gff(args_op.tsss, "tss") self.multiparser.parser_gff(args_op.tsss, "TSS") self.multiparser.combine_gff(args_op.gffs, self.tss_path, None, "TSS") if args_op.terms is not None: self._check_gff(args_op.terms, "term") self.multiparser.parser_gff(args_op.terms, "term") self.multiparser.combine_gff(args_op.gffs, self.term_path, None, "term")
def _stat(self, table_path, stat_folder, log): log.write("Running stat_operon.py to do statistics.\n") for table in os.listdir(table_path): if table.endswith("_operon.csv"): filename = "_".join(["stat", table]) out_stat = os.path.join(stat_folder, filename) stat(os.path.join(table_path, table), out_stat) log.write("\t" + out_stat + "\n")
def run_operon(self, args_op, log): self._check_and_parser_gff(args_op) prefixs = [] for gff in os.listdir(args_op.gffs): if gff.endswith(".gff"): prefixs.append(gff.replace(".gff", "")) self._detect_operon(prefixs, args_op, log) self._stat(self.table_path, args_op.stat_folder, log) self.helper.remove_tmp_dir(args_op.gffs) self.helper.remove_tmp_dir(args_op.tsss) self.helper.remove_tmp_dir(args_op.trans) if args_op.terms is not None: self.helper.remove_tmp_dir(args_op.terms)
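# Conceptual sketch of the grouping performed by operon() above: genes whose
# coordinates fall inside the same transcript (on the same strand) are
# reported as one operon. The real implementation additionally applies
# TSS/terminator fuzziness and a minimum length; "genes" is assumed here to be
# a list of (start, end) tuples. Illustration only.
def _example_genes_in_transcript(ta_start, ta_end, genes):
    """Return the gene intervals fully contained in one transcript."""
    return [(start, end) for (start, end) in genes
            if (start >= ta_start) and (end <= ta_end)]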
class TSSpredator(object): def __init__(self, args_tss): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.master = os.path.join(args_tss.out_folder, "MasterTables") self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta": "tmp_tss", "tmp": "tmp"} if args_tss.ta_files is not None: self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp") else: self.tmps["ta"] = None self.gff_path = os.path.join(args_tss.gffs, "tmp") self.wig_path = os.path.join(args_tss.wig_folder, "tmp") self.fasta_path = os.path.join(args_tss.fastas, "tmp") self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics") self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
def _assign_dict(self, lib_datas): return {"wig": lib_datas[0], "tex": lib_datas[1], "condition": int(lib_datas[2]), "replicate": lib_datas[3], "strand": lib_datas[4]}
def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix): for num_id in range(1, lib_num+1): cond_list = [] for lib in lib_list: if num_id == lib["condition"]: cond_list.append(lib) cond_sort_list = sorted(cond_list, key=lambda k: k['replicate']) for cond in cond_sort_list: out.write("{0}_{1}{2} = {3}\n".format( prefix, cond["condition"], cond["replicate"], os.path.join(wig_folder, cond["wig"])))
def _start_to_run(self, tsspredator_path, config_file, out_path, prefix): print("Running TSSpredator for " + prefix) out = open(os.path.join(out_path, "log.txt"), "w") err = open(os.path.join(out_path, "err.txt"), "w") call(["java", "-jar", tsspredator_path, config_file], stdout=out, stderr=err) out.close() err.close()
def _import_lib(self, libs, wig_folder, project_strain_name, out, gff, program, fasta): lib_dict = {"fp": [], "fm": [], "nm": [], "np": []} lib_num = 0 rep_set = set() list_num_id = [] print("Running {0} now...".format(program)) for lib in libs: lib_datas = lib.split(":") if not lib_datas[0].endswith(".wig"): print("Error: Wiggle files do not end with .wig!") sys.exit() for wig in os.listdir(wig_folder): filename = wig.split("_STRAIN_") if (filename[0] == lib_datas[0][:-4]) and ( filename[1][:-4] == project_strain_name): lib_datas[0] = wig if int(lib_datas[2]) > lib_num: lib_num = int(lib_datas[2]) if lib_datas[3] not in rep_set: rep_set.add(lib_datas[3]) if (lib_datas[1] == "tex") and (lib_datas[4] == "+"): lib_dict["fp"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"): lib_dict["fm"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"): lib_dict["np"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"): lib_dict["nm"].append(self._assign_dict(lib_datas)) for num_id in range(1, lib_num+1): out.write("annotation_{0} = {1}\n".format(num_id, gff)) if program.lower() == "tss": self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "fivePrimeMinus") self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "fivePrimePlus") elif program.lower() == "processing_site": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "fivePrimeMinus") self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "fivePrimePlus") else: print("Error: Wrong program name! Please assign tss or processing_site.") sys.exit() for num_id in range(1, lib_num+1): out.write("genome_{0} = {1}\n".format(num_id, fasta)) for num_id in range(1, lib_num+1): list_num_id.append(str(num_id)) return lib_num, num_id, rep_set, lib_dict, list_num_id
def _gen_config(self, project_strain_name, args_tss, gff, wig_folder, fasta, config_file): master_folder = "MasterTable_" + project_strain_name out_path = os.path.join(self.master, master_folder) self.helper.check_make_folder(out_path) out = open(config_file, "w") out.write("TSSinClusterSelectionMethod = HIGHEST\n") out.write("allowedCompareShift = 1\n") out.write("allowedRepCompareShift = 1\n") lib_num, num_id, rep_set, lib_dict, list_num_id = \ self._import_lib(args_tss.libs, wig_folder, project_strain_name, out, gff, args_tss.program, fasta) out.write("idList = ") out.write(",".join(list_num_id) + "\n") out.write("maxASutrLength = 100\n") out.write("maxGapLengthInGene = 500\n") out.write("maxNormalTo5primeFactor = {0}\n".format( args_tss.processing_factor)) out.write("maxTSSinClusterDistance = {0}\n".format( args_tss.cluster + 1)) out.write("maxUTRlength = {0}\n".format(args_tss.utr_length)) out.write("min5primeToNormalFactor = {0}\n".format( args_tss.enrichment_factor)) out.write("minCliffFactor = {0}\n".format(args_tss.factor)) out.write("minCliffFactorDiscount = {0}\n".format( args_tss.factor_reduction)) out.write("minCliffHeight = {0}\n".format(args_tss.height)) out.write("minCliffHeightDiscount = {0}\n".format( args_tss.height_reduction)) out.write("minNormalHeight = {0}\n".format(args_tss.base_height)) out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch)) out.write("minPlateauLength = 0\n") out.write("mode = cond\n") out.write("normPercentile = 0.9\n") if args_tss.program.lower() == "tss": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "normalMinus") self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "normalPlus") else: self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "normalMinus") self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "normalPlus") out.write("numReplicates = {0}\n".format(len(rep_set))) out.write("numberOfDatasets = {0}\n".format(lib_num)) out.write("outputDirectory = {0}\n".format(out_path)) for prefix_id in range(len(args_tss.output_prefixs)): out.write("outputPrefix_{0} = {1}\n".format( prefix_id + 1, args_tss.output_prefixs[prefix_id])) out.write("projectName = {0}\n".format(project_strain_name)) out.write("superGraphCompatibility = igb\n") out.write("texNormPercentile = 0.5\n") out.write("writeGraphs = 0\n") out.write("writeNocornacFiles = 0\n") out.close()
def _convert_gff(self, prefixs, args_tss): for prefix in prefixs: out_file = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program]) + ".gff") gff_f = open(out_file, "w") out_path = os.path.join(self.master, "_".join([ "MasterTable", prefix])) if "MasterTable.tsv" not in os.listdir(out_path): print("Error: No MasterTable file found in {0}".format( out_path)) print("Please check the configuration file.") else: self.converter.convert_mastertable2gff( os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic", args_tss.program, prefix, out_file) gff_f.close()
def _merge_manual(self, tsss, args_tss): self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tss"])) for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break filename = "_".join([tss, args_tss.program]) + ".gff" predict = os.path.join(self.gff_outfolder, filename) print("Merging and classifying manually detected TSSs ...") stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss) merge_manual_predict_tss( predict, stat_file, os.path.join(self.tmps["tss"], filename), os.path.join(args_tss.gffs, gff), args_tss) shutil.move(stat_file, os.path.join(args_tss.out_folder, "statistics", tss, stat_file)) self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder, [".gff"])
[".gff"]) shutil.rmtree(self.tmps["tss"]) def _validate(self, tsss, args_tss): print("Running validation of annotation....") for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break stat_file = os.path.join( self.stat_outfolder, tss, "".join(["stat_gene_vali_", tss, ".csv"])) out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff") if args_tss.program.lower() == "tss": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) elif args_tss.program.lower() == "processing": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "processing.gff"])) validate_gff(compare_file, os.path.join(args_tss.gffs, gff), stat_file, out_cds_file, args_tss.utr_length, args_tss.program.lower()) shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff)) def _compare_ta(self, tsss, args_tss): detect = False print("Running compare transcript assembly and TSS ...") self.multiparser.parser_gff(args_tss.ta_files, "transcript") self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None, "transcript") for tss in tsss: stat_out = os.path.join( self.stat_outfolder, tss, "".join([ "stat_compare_TSS_Transcriptome_assembly_", tss, ".csv"])) for ta in os.listdir(self.tmps["ta"]): filename = ta.split("_transcript") if (filename[0] == tss) and (filename[1] == ".gff"): detect = True break compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) if detect: stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file, stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"], args_tss.fuzzy) self.helper.sort_gff(self.tmps["tss_ta"], compare_file) self.helper.sort_gff(self.tmps["ta_tss"], os.path.join(args_tss.ta_files, ta)) os.remove(self.tmps["tss_ta"]) os.remove(self.tmps["ta_tss"]) detect = False def _stat_tss(self, tsss, feature): print("Running statistaics.....") for tss in tsss: compare_file = os.path.join(self.gff_outfolder, "_".join([tss, feature]) + ".gff") stat_tsspredator( compare_file, feature, os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "class", tss]) + ".csv"), os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "libs", tss]) + ".csv")) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_class", ".png"]) if os.path.exists(os.path.join( self.stat_outfolder, "TSSstatistics.tsv")): shutil.move( os.path.join( self.stat_outfolder, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, tss, "TSSstatistics.tsv")) plot_venn(compare_file, feature) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_venn", ".png"]) def _set_gen_config(self, args_tss, input_folder): prefixs = [] detect = False for fasta in os.listdir(self.fasta_path): for gff in os.listdir(self.gff_path): if fasta[:-3] == gff[:-4]: prefix = fasta[:-3] for wig in os.listdir(self.wig_path): filename = wig.split("_STRAIN_") if filename[1][:-4] == prefix: detect = True break if detect: prefixs.append(prefix) config = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._gen_config( prefix, args_tss, os.path.join(self.gff_path, gff), self.wig_path, os.path.join(self.fasta_path, fasta), config) return prefixs def _merge_wigs(self, wig_folder, prefix, libs): self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tmp"])) for wig_file in os.listdir(wig_folder): for lib in libs: info = lib.split(":") if (info[0][:-4] in wig_file) and (info[-1] == "+") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): 
Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_forward.wig")) if (info[0][:-4] in wig_file) and (info[-1] == "-") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_reverse.wig")) def _check_orphan(self, prefixs, wig_folder, args_tss): for prefix in prefixs: self._merge_wigs(wig_folder, prefix, args_tss.libs) tmp_tss = os.path.join(self.tmps["tmp"], "_".join([ prefix, args_tss.program + ".gff"])) pre_tss = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program + ".gff"])) check_orphan(pre_tss, os.path.join( args_tss.gffs, prefix + ".gff"), "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss) shutil.move(tmp_tss, pre_tss) shutil.rmtree("tmp") def _remove_files(self, args_tss): print("Remove temperary files and folders...") self.helper.remove_tmp(args_tss.fastas) self.helper.remove_tmp(args_tss.gffs) self.helper.remove_tmp(args_tss.wig_folder) self.helper.remove_tmp(args_tss.ta_files) if "merge_forward.wig" in os.listdir(os.getcwd()): os.remove("merge_forward.wig") if "merge_reverse.wig" in os.listdir(os.getcwd()): os.remove("merge_reverse.wig") def _deal_with_overlap(self, out_folder, args_tss): if args_tss.overlap_feature.lower() == "both": pass else: print("Comparing TSS and Processing site...") if args_tss.program.lower() == "tss": for tss in os.listdir(out_folder): if tss.endswith("_TSS.gff"): ref = self.helper.get_correct_file( args_tss.references, "_processing.gff", tss.replace("_TSS.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.overlap_feature, args_tss.cluster) elif args_tss.program.lower() == "processing_site": for tss in os.listdir(out_folder): if tss.endswith("_processing.gff"): ref = self.helper.get_correct_file( args_tss.references, "_TSS.gff", tss.replace("_processing.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.overlap_feature, args_tss.cluster) def _low_expression(self, args_tss, gff_folder): prefix = None self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs) for gff in os.listdir(gff_folder): if (args_tss.program.lower() == "tss") and ( gff.endswith("_TSS.gff")): prefix = gff.replace("_TSS.gff", "") elif (args_tss.program.lower() == "processing") and ( gff.endswith("_processing.gff")): prefix = gff.replace("_processing.gff", "") if prefix: out = open(os.path.join( self.stat_outfolder, prefix, "_".join([ "stat", prefix, "low_expression_cutoff.csv"])), "w") out.write("\t".join(["strain", "cutoff_coverage"]) + "\n") cutoff = filter_low_expression( os.path.join(gff_folder, gff), args_tss, "tmp/merge_forward.wig", "tmp/merge_reverse.wig", "tmp/without_low_expression.gff") out.write("\t".join([prefix, str(cutoff)]) + "\n") os.remove(os.path.join(gff_folder, gff)) shutil.move("tmp/without_low_expression.gff", os.path.join(gff_folder, gff)) prefix = None out.close() def run_tsspredator(self, args_tss): input_folder = os.path.join(args_tss.out_folder, "configs") for gff in os.listdir(args_tss.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_tss.gffs, gff)) self.helper.check_make_folder(self.gff_outfolder) self.multiparser.parser_fasta(args_tss.fastas) self.multiparser.parser_gff(args_tss.gffs, None) self.multiparser.parser_wig(args_tss.wig_folder) prefixs = self._set_gen_config(args_tss, input_folder) for prefix in prefixs: out_path = os.path.join( self.master, "_".join(["MasterTable", prefix])) 
config_file = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._start_to_run(args_tss.tsspredator_path, config_file, out_path, prefix) if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")): shutil.move(os.path.join(out_path, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, "TSSstatistics.tsv")) if args_tss.program.lower() == "processing_site": args_tss.program = "processing" self._convert_gff(prefixs, args_tss) if args_tss.check_orphan: print("checking the orphan TSS...") self._check_orphan(prefixs, os.path.join(args_tss.wig_folder, "tmp"), args_tss) self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder, None, args_tss.program) datas = [] for gff in os.listdir(self.gff_outfolder): if gff.endswith(".gff"): gff_folder = gff.replace("".join(["_", args_tss.program, ".gff"]), "") self.helper.check_make_folder( os.path.join(self.stat_outfolder, gff_folder)) datas.append(gff_folder) if args_tss.remove_low_expression is not None: self._low_expression(args_tss, self.gff_outfolder) if args_tss.manual is not None: self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None, args_tss.libs) self._merge_manual(datas, args_tss) self._deal_with_overlap(self.gff_outfolder, args_tss) if args_tss.stat: self._stat_tss(datas, args_tss.program) if args_tss.validate: self._validate(datas, args_tss) if args_tss.ta_files is not None: self._compare_ta(datas, args_tss) self._remove_files(args_tss)
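# The library descriptors consumed by _import_lib above are colon-separated
# strings of the form "<file>.wig:<tex|notex>:<condition>:<replicate>:<strand>".
# A minimal sketch of that parsing, mirroring _assign_dict; the example
# descriptor in the docstring is hypothetical. Not called by the pipeline.
def _example_parse_lib(lib):
    """Split one library descriptor, e.g. "cond1_tex.wig:tex:1:a:+"."""
    wig, tex, condition, replicate, strand = lib.split(":")
    return {"wig": wig, "tex": tex, "condition": int(condition),
            "replicate": replicate, "strand": strand}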
class SubLocal(object): '''detection of subcellular localization''' def __init__(self, args_sub): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_path = os.path.join(args_sub.gffs, "tmp") self.fasta_path = os.path.join(args_sub.fastas, "tmp") if args_sub.trans is not None: self.tran_path = os.path.join(args_sub.trans, "tmp") else: self.tran_path = None self.out_all = os.path.join(args_sub.out_folder, "all_CDSs") self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs") self.all_tmp_path = os.path.join(self.out_all, "tmp") self.express_tmp_path = os.path.join(self.out_express, "tmp") self.all_stat_path = os.path.join(self.out_all, "statistics") self.express_stat_path = os.path.join(self.out_express, "statistics") self.all_tmp_result = os.path.join(self.out_all, "tmp_results") self.express_tmp_result = os.path.join(self.out_express, "tmp_results") self.all_result = os.path.join(self.out_all, "psortb_results") self.express_result = os.path.join(self.out_express, "psortb_results") self.endfix_table = "table.csv" self.endfix_raw = "raw.txt" self._make_folder()
def _make_folder(self): self.helper.check_make_folder(self.out_all) self.helper.check_make_folder(self.out_express) self.helper.check_make_folder(self.all_stat_path) self.helper.check_make_folder(self.express_stat_path) self.helper.check_make_folder(self.all_result) self.helper.check_make_folder(self.express_result)
def _compare_cds_tran(self, gff_file, tran_file, log): '''compare CDSs and transcripts to find the expressed CDSs''' log.write("Comparing transcripts and CDSs to get expressed CDSs.\n") out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w") cdss = [] fh = open(gff_file) th = open(tran_file) for entry in Gff3Parser().entries(fh): if entry.feature == "CDS": cdss.append(entry) trans = [] for entry in Gff3Parser().entries(th): trans.append(entry) for cds in cdss: for ta in trans: if (cds.strand == ta.strand) and (cds.seq_id == ta.seq_id): if ((cds.end < ta.end) and (cds.end > ta.start) and (cds.start <= ta.start)) or ( (cds.start > ta.start) and (cds.start < ta.end) and (cds.end >= ta.end)) or ( (cds.end >= ta.end) and (cds.start <= ta.start)) or ( (cds.end <= ta.end) and (cds.start >= ta.start)): out.write(cds.info + "\n") break fh.close() th.close() out.close() log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is " "temporarily generated.\n")
def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log): prefix = gff.replace(".gff", "") fasta = self.helper.get_correct_file(self.fasta_path, ".fa", prefix, None, None) dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"])) print("Generating CDS fasta files of {0}".format(prefix)) if tran_path is not None: log.write("Predicting subcellular localization for expressed " "CDSs for {0}.\n".format(prefix)) self._compare_cds_tran( os.path.join(self.gff_path, gff), os.path.join(tran_path, "_".join([prefix, "transcript.gff"])), log) log.write("Running helper.py to extract sequences for CDSs.\n") self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"), fasta, dna_seq_file) os.remove(os.path.join(self.out_all, "tmp_cds.gff")) else: log.write("Predicting subcellular localization for all CDSs for " "{0}.\n".format(prefix)) log.write("Running helper.py to extract sequences for CDSs.\n") self.helper.get_cds_seq(os.path.join(self.gff_path, gff), fasta, dna_seq_file) log.write("\t" + dna_seq_file + " is generated.\n") print("Translating DNA sequences to protein sequences of {0}".format( prefix))
log.write("Running helper.py to translate DNA sequences to Protein " "sequences.\n") tmp_file = os.path.join(args_sub.out_folder, "tmp") self.helper.translation(dna_seq_file, tmp_file) prot_seq_file = os.path.join(tmp_path, "_".join([prefix, "protein.fa"])) self.fixer.fix_emboss(tmp_file, prot_seq_file) log.write(prot_seq_file + " is generated.\n") os.remove(tmp_file) return prefix def _psortb(self, psortb_path, strain_type, prot_seq_file, out_raw, out_err, log): log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n") call([psortb_path, strain_type, prot_seq_file], stdout=out_raw, stderr=out_err) def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result, log): print("Running psortb of {0}".format(prefix)) log.write("Running Psortb for predict subcellular localization for " "{0}.\n".format(prefix)) out_err = open(os.path.join(out_folder, "tmp_log"), "w") out_raw = open( os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])), "w") prot_seq_file = os.path.join(tmp_path, "_".join([prefix, "protein.fa"])) if args_sub.gram == "positive": self._psortb(args_sub.psortb_path, "-p", prot_seq_file, out_raw, out_err, log) elif args_sub.gram == "negative": self._psortb(args_sub.psortb_path, "-n", prot_seq_file, out_raw, out_err, log) else: log.write("Please assign \"positive\" or \"negative\" to " "--bacteria_type.\n") print("Error: {0} is not a proper bacteria type! " "Please assign positive or negative.".format(args_sub.gram)) sys.exit() log.write( "\t" + os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])) + " is temporary generated.\n") out_err.close() out_raw.close() def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file, log): '''extract the result of psortb''' log.write("Running extract_psortb.py to extract the information of " "localization.\n") extract_psortb( os.path.join(tmp_psortb_path, "_".join([prefix, self.endfix_raw])), os.path.join(tmp_psortb_path, "_".join([prefix, self.endfix_table])), None, None, args_sub.fuzzy) log.write("\t" + os.path.join(tmp_psortb_path, "_".join( [prefix, self.endfix_table])) + " is tempoaray generated.\n") def _remove_header(self, out_all): out = open(out_all + "_tmp", "w") fh = open(out_all, "r") out.write("\t".join([ "#Genome", "Protein", "Strand", "Start", "End", "Location", "Score" ]) + "\n") for row in csv.reader(fh, delimiter='\t'): if row[0] != "#Genome": out.write("\t".join(row) + "\n") out.close() fh.close() shutil.move(out_all + "_tmp", out_all) def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result, log): for folder in os.listdir(gffs): if folder.endswith(".gff_folder"): prefix = folder.replace(".gff_folder", "") self.helper.check_make_folder( os.path.join(psortb_result, prefix)) merge_table = os.path.join( psortb_result, prefix, "_".join([prefix, self.endfix_table])) for gff in os.listdir(os.path.join(gffs, folder)): result = self.helper.get_correct_file( tmp_psortb_path, "_" + self.endfix_raw, gff.replace(".gff", ""), None, None) shutil.copy(result, os.path.join(psortb_result, prefix)) result = self.helper.get_correct_file( tmp_psortb_path, "_" + self.endfix_table, gff.replace(".gff", ""), None, None) self.helper.merge_file(result, merge_table) log.write("\t" + merge_table + "\n") self._remove_header(merge_table) self.helper.check_make_folder(os.path.join(stat_path, prefix)) stat_folder = os.path.join(stat_path, prefix) stat_file = os.path.join( stat_folder, "_".join(["stat", prefix, "sublocal.csv"])) stat_sublocal(merge_table, os.path.join(stat_folder, prefix), 
stat_file) for file_ in os.listdir(stat_folder): log.write("\t" + os.path.join(stat_folder, file_) + "\n") def _remove_tmps(self, args_sub): self.helper.remove_tmp_dir(args_sub.fastas) self.helper.remove_tmp_dir(args_sub.gffs) self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir") self.helper.remove_all_content(self.out_all, "tmp", "dir") self.helper.remove_all_content(self.out_express, "tmp", "dir") os.remove(os.path.join(self.out_all, "tmp_log")) if args_sub.trans is not None: os.remove(os.path.join(self.out_express, "tmp_log")) self.helper.remove_tmp_dir(args_sub.trans) def run_sub_local(self, args_sub, log): for gff in os.listdir(args_sub.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes( os.path.join(args_sub.gffs, gff)) self.multiparser.parser_gff(args_sub.gffs, None) self.multiparser.parser_fasta(args_sub.fastas) if args_sub.trans is not None: self.multiparser.parser_gff(args_sub.trans, "transcript") self.helper.check_make_folder(self.express_tmp_path) self.helper.check_make_folder(self.express_tmp_result) self.helper.check_make_folder(self.all_tmp_path) self.helper.check_make_folder(self.all_tmp_result) for gff in os.listdir(self.gff_path): if args_sub.trans is not None: print("Running expressed genes now") prefix = self._get_protein_seq(gff, self.express_tmp_path, self.tran_path, args_sub, log) self._run_psortb(args_sub, prefix, self.out_express, self.express_tmp_path, self.express_tmp_result, log) self._extract_result(args_sub, self.express_tmp_result, prefix, os.path.join(self.gff_path, gff), log) print("Running all genes now") prefix = self._get_protein_seq(gff, self.all_tmp_path, None, args_sub, log) self._run_psortb(args_sub, prefix, self.out_all, self.all_tmp_path, self.all_tmp_result, log) self._extract_result(args_sub, self.all_tmp_result, prefix, os.path.join(self.gff_path, gff), log) log.write("Running stat_sublocal.py to do statistics, generate " "merged tables, and plot figures.\n") log.write("The following files are generated:\n") self._merge_and_stat(args_sub.gffs, self.all_tmp_result, self.all_stat_path, self.all_result, log) if args_sub.trans is not None: self._merge_and_stat(args_sub.gffs, self.express_tmp_result, self.express_stat_path, self.express_result, log) self._remove_tmps(args_sub)
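# The four-clause test in _compare_cds_tran above is essentially an interval
# overlap check between a CDS and a transcript on the same strand and
# sequence. A compact, nearly equivalent form (single-base edge cases can
# differ) is sketched here for clarity; it is not used by the pipeline.
def _example_cds_overlaps_transcript(cds_start, cds_end, ta_start, ta_end):
    """True if the CDS interval overlaps the transcript interval."""
    return (cds_start <= ta_end) and (cds_end >= ta_start)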
class GoTermFinding(object): '''Retrieving the GO term''' def __init__(self, args_go): self.multiparser = Multiparser() self.helper = Helper() self.out_all = os.path.join(args_go.out_folder, "all_CDSs") self.out_express = os.path.join(args_go.out_folder, "expressed_CDSs") self.result_all_path = os.path.join(self.out_all, "GO_term_results") self.result_express_path = os.path.join(self.out_express, "GO_term_results") self.gff_path = os.path.join(args_go.gffs, "tmp") if args_go.trans is not None: self.tran_path = os.path.join(args_go.trans, "tmp") else: self.tran_path = None self.stat_all_path = os.path.join(self.out_all, "statistics") self.stat_express_path = os.path.join(self.out_express, "statistics") self.all_strain = "all_genomes_uniprot.csv" def _retrieve_go(self, uniprot, out_path, type_, log): prefixs = [] log.write("Running gene_ontology.py to retrieve GO terms.\n") for gff in os.listdir(self.gff_path): prefix = gff.replace(".gff", "") prefixs.append(prefix) self.helper.check_make_folder(os.path.join(out_path, prefix)) out_file = os.path.join(out_path, prefix, "_".join([prefix, "uniprot.csv"])) print("Extracting GO terms of {0} from UniProt".format(prefix)) if self.tran_path is not None: tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) else: tran_file = None retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff), out_file, tran_file, type_) log.write("\t" + out_file + " is generated.\n") def _remove_header(self, out_all): out = open(out_all + "_tmp", "w") fh = open(out_all, "r") out.write("\t".join( ["Genome", "Strand", "Start", "End", "Protein_id", "Go_term"]) + "\n") for row in csv.reader(fh, delimiter='\t'): if row[0] != "Genome": out.write("\t".join(row) + "\n") out.close() fh.close() shutil.move(out_all + "_tmp", out_all) def _merge_files(self, gffs, out_path, out_folder, log): '''merge the files according to the input genome folder''' folders = [] log.write("Merging the output files based on the input genome " "information.\n") for folder in os.listdir(gffs): if folder.endswith("gff_folder"): folder_prefix = folder.replace(".gff_folder", "") folder_path = os.path.join(out_folder, folder_prefix) self.helper.check_make_folder(folder_path) folders.append(folder_path) filenames = [] for gff in os.listdir(os.path.join(gffs, folder)): if gff.endswith(".gff"): filenames.append(gff.replace(".gff", "")) out_all = os.path.join(folder_path, self.all_strain) if len(filenames) > 1: if self.all_strain in os.listdir(folder_path): os.remove(out_all) for filename in filenames: csv_file = "_".join([filename, "uniprot.csv"]) self.helper.merge_file( os.path.join(out_path, filename, csv_file), out_all) self._remove_header(out_all) shutil.copy(os.path.join(out_path, filename, csv_file), folder_path) else: shutil.copyfile( os.path.join(out_path, filenames[0], "_".join([filenames[0], "uniprot.csv"])), out_all) self.helper.remove_all_content(out_path, None, "dir") self.helper.remove_all_content(out_path, None, "file") for folder in folders: folder_prefix = folder.split("/")[-1] shutil.move(folder, os.path.join(out_path, folder_prefix)) for file_ in os.listdir(os.path.join(out_path, folder_prefix)): log.write("\t" + os.path.join(out_path, folder_prefix, file_) + " is generated.\n") def _stat(self, out_path, stat_path, go, goslim, out_folder, log): log.write("Running gene_ontology.py to Retrieve GOslim terms and " "do statistics.\n") log.write("The following files are generated:\n") for folder in os.listdir(out_path): strain_stat_path = os.path.join(stat_path, folder) 
self.helper.check_make_folder(strain_stat_path) fig_path = os.path.join(strain_stat_path, "figs") if "figs" not in os.listdir(strain_stat_path): os.mkdir(fig_path) stat_file = os.path.join(strain_stat_path, "_".join(["stat", folder + ".csv"])) map2goslim(goslim, go, os.path.join(out_path, folder, self.all_strain), stat_file, out_folder) log.write("\t" + stat_file + "\n") self.helper.move_all_content(out_folder, fig_path, ["_three_roots.png"]) self.helper.move_all_content(out_folder, fig_path, ["_molecular_function.png"]) self.helper.move_all_content(out_folder, fig_path, ["_cellular_component.png"]) self.helper.move_all_content(out_folder, fig_path, ["_biological_process.png"]) for file_ in os.listdir(fig_path): log.write("\t" + os.path.join(fig_path, file_) + "\n")
def run_go_term(self, args_go, log): for gff in os.listdir(args_go.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes( os.path.join(args_go.gffs, gff)) self.multiparser.parser_gff(args_go.gffs, None) if args_go.trans is not None: self.multiparser.parser_gff(args_go.trans, "transcript") print("Computing all CDSs") log.write("Retrieving GO terms for all CDSs.\n") self._retrieve_go(args_go.uniprot, self.result_all_path, "all", log) self._merge_files(args_go.gffs, self.result_all_path, self.out_all, log) self._stat(self.result_all_path, self.stat_all_path, args_go.go, args_go.goslim, self.out_all, log) if args_go.trans is not None: log.write("Retrieving GO terms only for expressed CDSs.\n") print("Computing expressed CDSs") self._retrieve_go(args_go.uniprot, self.result_express_path, "express", log) self._merge_files(args_go.gffs, self.result_express_path, self.out_express, log) self._stat(self.result_express_path, self.stat_express_path, args_go.go, args_go.goslim, self.out_express, log) self.helper.remove_tmp_dir(args_go.gffs) if args_go.trans is not None: self.helper.remove_tmp_dir(args_go.trans)
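# The merged UniProt tables written above begin with the header emitted by
# _merge_files ("Genome", "Strand", "Start", "End", "Protein_id", "Go_term").
# A minimal reader for such a table, reusing the csv module already used in
# this script; an illustration only, not part of the pipeline.
def _example_read_uniprot_table(csv_file):
    """Yield (protein_id, go_terms) pairs from a merged uniprot CSV."""
    with open(csv_file) as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if row and (row[0] != "Genome"):
                yield row[4], row[5]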
class GoTermFinding(object): def __init__(self, args_go): self.multiparser = Multiparser() self.helper = Helper() self.out_all = os.path.join(args_go.out_folder, "all_CDS") self.out_express = os.path.join(args_go.out_folder, "expressed_CDS") self.result_all_path = os.path.join(self.out_all, "Go_term_results") self.result_express_path = os.path.join(self.out_express, "Go_term_results") self.gff_path = os.path.join(args_go.gffs, "tmp") if args_go.trans is not None: self.tran_path = os.path.join(args_go.trans, "tmp") else: self.tran_path = None self.stat_all_path = os.path.join(self.out_all, "statistics") self.stat_express_path = os.path.join(self.out_express, "statistics") self.all_strain = "all_strains_uniprot.csv"
def _retrieve_go(self, uniprot, out_path, type_): prefixs = [] for gff in os.listdir(self.gff_path): prefix = gff.replace(".gff", "") prefixs.append(prefix) self.helper.check_make_folder(os.path.join(out_path, prefix)) out_file = os.path.join(out_path, prefix, "_".join([prefix, "uniprot.csv"])) print("Extracting GO terms of {0} from UniProt ...".format(prefix)) if self.tran_path is not None: tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) else: tran_file = None retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff), out_file, tran_file, type_)
def _merge_files(self, gffs, out_path, out_folder): folders = [] for folder in os.listdir(gffs): if folder.endswith("gff_folder"): folder_prefix = folder.replace(".gff_folder", "") folder_path = os.path.join(out_folder, folder_prefix) self.helper.check_make_folder(folder_path) folders.append(folder_path) filenames = [] for gff in os.listdir(os.path.join(gffs, folder)): if gff.endswith(".gff"): filenames.append(gff.replace(".gff", "")) out_all = os.path.join(folder_path, self.all_strain) if len(filenames) > 1: if self.all_strain in os.listdir(folder_path): os.remove(out_all) for filename in filenames: csv_file = "_".join([filename, "uniprot.csv"]) self.helper.merge_file(os.path.join(out_path, filename, csv_file), out_all) shutil.copy(os.path.join(out_path, filename, csv_file), folder_path) else: shutil.copyfile(os.path.join(out_path, filenames[0], "_".join([filenames[0], "uniprot.csv"])), out_all) self.helper.remove_all_content(out_path, None, "dir") self.helper.remove_all_content(out_path, None, "file") for folder in folders: folder_prefix = folder.split("/")[-1] shutil.move(folder, os.path.join(out_path, folder_prefix))
def _stat(self, out_path, stat_path, go, goslim, out_folder): for folder in os.listdir(out_path): strain_stat_path = os.path.join(stat_path, folder) self.helper.check_make_folder(strain_stat_path) fig_path = os.path.join(strain_stat_path, "figs") if "figs" not in os.listdir(strain_stat_path): os.mkdir(fig_path) print("Computing statistics of {0}".format(folder)) map2goslim(goslim, go, os.path.join(out_path, folder, self.all_strain), os.path.join(strain_stat_path, "_".join(["stat", folder + ".csv"])), out_folder) self.helper.move_all_content(out_folder, fig_path, ["_three_roots.png"]) self.helper.move_all_content(out_folder, fig_path, ["_molecular_function.png"]) self.helper.move_all_content(out_folder, fig_path, ["_cellular_component.png"]) self.helper.move_all_content(out_folder, fig_path, ["_biological_process.png"])
def run_go_term(self, args_go): for gff in os.listdir(args_go.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_go.gffs, gff)) self.multiparser.parser_gff(args_go.gffs, None) if args_go.trans is not None: self.multiparser.parser_gff(args_go.trans, "transcript") print("Computing all CDSs ...") self._retrieve_go(args_go.uniprot, self.result_all_path, "all") self._merge_files(args_go.gffs, self.result_all_path, self.out_all) self._stat(self.result_all_path, self.stat_all_path, args_go.go, args_go.goslim, self.out_all) if args_go.trans is not None: print("Computing expressed CDSs ...") self._retrieve_go(args_go.uniprot, self.result_express_path, "express") self._merge_files(args_go.gffs, self.result_express_path, self.out_express) self._stat(self.result_express_path, self.stat_express_path, args_go.go, args_go.goslim, self.out_express) self.helper.remove_tmp(args_go.gffs) if args_go.trans is not None: self.helper.remove_tmp(args_go.trans)
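# Both GoTermFinding variants above rely on Multiparser's folder convention:
# a multi-sequence input "<genome>.gff" is split into a "<genome>.gff_folder"
# directory holding one GFF per sequence. A sketch of recovering the genome
# prefixes from that layout (the directory naming is inferred from the code
# above); not called by the pipeline.
def _example_genome_prefixes(gff_dir):
    """Return the genome prefix of every <name>.gff_folder under gff_dir."""
    return [folder.replace(".gff_folder", "")
            for folder in os.listdir(gff_dir)
            if folder.endswith(".gff_folder")]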
class sRNADetection(object): def __init__(self, args_srna): self.args_container = ArgsContainer() self.helper = Helper() self.multiparser = Multiparser() self.gff_output = os.path.join(args_srna.out_folder, "gffs") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.tss_path = self._check_folder_exist(args_srna.tss_folder) self.pro_path = self._check_folder_exist(args_srna.pro_folder) self.sorf_path = self._check_folder_exist(args_srna.sorf_file) self.fasta_path = os.path.join(args_srna.fastas, "tmp") self.tran_path = os.path.join(args_srna.trans, "tmp") self.term_path = self._check_folder_exist(args_srna.terms) self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs") self.prefixs = {"merge": os.path.join( args_srna.out_folder, "tmp_merge"), "utr": os.path.join( args_srna.out_folder, "tmp_utrsrna"), "normal": os.path.join( args_srna.out_folder, "tmp_normal"), "in_cds": os.path.join( args_srna.out_folder, "tmp_incds"), "merge_table": os.path.join( args_srna.out_folder, "tmp_merge_table"), "utr_table": os.path.join( args_srna.out_folder, "tmp_utrsrna_table"), "normal_table": os.path.join( args_srna.out_folder, "tmp_normal_table"), "in_cds_table": os.path.join( args_srna.out_folder, "tmp_incds_table"), "basic": os.path.join( args_srna.out_folder, "tmp_basic"), "energy": os.path.join( args_srna.out_folder, "tmp_energy")} self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"), "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")} self.best_table = os.path.join(self.table_output, "best") self.all_best = {"all_gff": os.path.join( self.gff_output, "all_candidates"), "best_gff": os.path.join(self.gff_output, "best"), "all_table": os.path.join( self.table_output, "all_candidates"), "best_table": os.path.join(self.table_output, "best")}
def _check_folder_exist(self, folder): if folder is not None: path = os.path.join(folder, "tmp") else: path = None return path
def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff))
def _run_format(self, blast_path, database, type_, db_file, err): call([os.path.join(blast_path, "makeblastdb"), "-in", database, "-dbtype", type_, "-out", db_file], stderr=err)
def _formatdb(self, database, type_, out_folder, blast_path, database_type): err = open(os.path.join(out_folder, "log.txt"), "w") if (database.endswith(".fa")) or ( database.endswith(".fna")) or ( database.endswith(".fasta")): pass else: folders = database.split("/") filename = folders[-1] folder = "/".join(folders[:-1]) for fasta in os.listdir(folder): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): if ".".join(fasta.split(".")[:-1]) == filename: database = os.path.join(folder, fasta) if database_type == "sRNA": change_format(database, "tmp_srna_database") os.remove(database) shutil.move("tmp_srna_database", database) db_file = ".".join(database.split(".")[:-1]) self._run_format(blast_path, database, type_, db_file, err) err.close()
def _merge_frag_tex_file(self, files, args_srna): if (args_srna.frag_wigs is not None) and ( args_srna.tex_wigs is not None): self.helper.merge_file(files["frag_gff"], files["tex_gff"]) self.helper.merge_file(files["frag_csv"], files["tex_csv"]) shutil.move(files["tex_csv"], files["merge_csv"])
self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) os.remove(files["frag_csv"]) os.remove(files["frag_gff"]) os.remove(files["tex_gff"]) elif (args_srna.frag_wigs is not None): shutil.move(files["frag_csv"], files["merge_csv"]) self.helper.sort_gff(files["frag_gff"], files["merge_gff"]) os.remove(files["frag_gff"]) elif (args_srna.tex_wigs is not None): shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna): if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter")) files = {"frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None} if ("tss" in args_srna.import_info): tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff", prefix, None, None) else: tss = None if self.pro_path is not None: pro = self.helper.get_correct_file( self.pro_path, "_processing.gff", prefix, None, None) else: pro = None if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag_table", prefix])) args_srna = self.args_container.container_intersrna( "frag", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) intergenic_srna(args_srna) if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex_table", prefix])) args_srna = self.args_container.container_intersrna( "tex", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) intergenic_srna(args_srna) files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["normal"], prefix]) self._merge_frag_tex_file(files, args_srna) if "TSS_class" in os.listdir(args_srna.out_folder): tss = os.path.join(args_srna.out_folder, "TSS_class", prefix + "_TSS.gff") return tss def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna): if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) files = {"frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None} if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix, args_srna) utr_derived_srna(args_srna) if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "frag", prefix, args_srna) utr_derived_srna(args_srna) files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["utr"], prefix]) self._merge_frag_tex_file(files, args_srna) filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr) 
    def _check_necessary_file(self, args_srna):
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: Lack required files!")
            sys.exit()
        if args_srna.utr_srna:
            if args_srna.tss_folder is None:
                print("Error: Lack required TSS files for UTR-derived "
                      "sRNA detection!")
                sys.exit()
            if args_srna.pro_folder is None:
                print("Warning: Lack processing site files for "
                      "UTR-derived sRNA detection!")
                print("It may affect the results.")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
                "blast_nr" in args_srna.import_info) or (
                "blast_srna" in args_srna.import_info):
            if args_srna.fastas is None:
                print("Error: Lack required fasta files for "
                      "sRNA detection!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _run_program(self, args_srna):
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"],
                                            prefix])}
                csvs = {"merge": "_".join(
                            [self.prefixs["merge_table"], prefix]),
                        "utr": "_".join([self.prefixs["utr_table"],
                                         prefix]),
                        "normal": "_".join(
                            [self.prefixs["normal_table"], prefix])}
                tss = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR-derived sRNA detection of "
                          "{0}".format(prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff",
                            prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss, pro,
                                          args_srna)
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss)
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"],
                                               prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss):
        print("Merging data of intergenic and UTR-derived sRNAs...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs,
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "forward.wig"])),
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "reverse.wig"])),
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        os.system(" ".join(["cat", seq_file, "|",
                            os.path.join(vienna_path, "RNAfold"),
                            "-p", ">", sec_file]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
                    fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder,
                                    "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder,
                                    "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error: There is no fasta file of {0}.".format(prefix))
            print("Please check the imported information.")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(
            os.path.join(main_path,
                         "_".join([self.prefixs["basic"], prefix])),
            sec_file,
            os.path.join(main_path,
                         "_".join([self.prefixs["energy"], prefix])))
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path,
                "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file,
                    rel_file):
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp_paths["tmp"], file_),
                            os.path.join(tmp_paths["tmp"], dot_file),
                            ">", os.path.join(tmp_paths["tmp"],
                                              rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_),
              pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("Replotting {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if file_.endswith("rss.ps") or file_.endswith("dp.ps"):
                pdf_file = file_.replace(".ps", ".pdf")
                print("Converting {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
            ["rss.pdf"])
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
            ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path,
                       tmp_paths, prefix, vienna_util):
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plots of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"],
                                            "mountain.txt")
                    out = open(moun_txt, "w")
"w") moun_file = dot_file.replace("dp.ps", "mountain.pdf") print("Generating {0}".format(moun_file)) self._run_mountain(vienna_util, tmp_paths, dot_file, out) plot_mountain_plot(moun_txt, moun_file) shutil.move(moun_file, os.path.join(tmp_moun_path, prefix, moun_file)) out.close() os.remove(moun_txt) def _compute_2d_and_energy(self, args_srna, prefixs): print("Running energy calculation....") moun_path = os.path.join(args_srna.out_folder, "mountain_plot") sec_path = os.path.join(args_srna.out_folder, "sec_structure", "sec_plot") dot_path = os.path.join(args_srna.out_folder, "sec_structure", "dot_plot") self.helper.remove_all_content(sec_path, None, "dir") self.helper.remove_all_content(dot_path, None, "dir") self.helper.remove_all_content(moun_path, None, "dir") for prefix in prefixs: tmp_paths = self._get_seq_sec( self.fasta_path, args_srna.out_folder, prefix, sec_path, dot_path, args_srna.vienna_path) self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths, args_srna.ps2pdf14_path, prefix) self._plot_mountain(args_srna.mountain, moun_path, tmp_paths, prefix, args_srna.vienna_util) self.helper.remove_all_content(os.getcwd(), ".ps", "file") os.chdir(tmp_paths["main"]) shutil.move("_".join([self.prefixs["energy"], prefix]), "_".join([self.prefixs["basic"], prefix])) shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna")) def _run_blast(self, blast_path, program, database, e, seq_file, blast_file, strand): call([os.path.join(blast_path, program), "-db", database, "-evalue", str(e), "-strand", strand, "-query", seq_file, "-out", blast_file]) def _get_strand_fasta(self, seq_file, out_folder): tmp_plus = os.path.join(out_folder, "tmp_plus.fa") tmp_minus = os.path.join(out_folder, "tmp_minus.fa") out_p = open(tmp_plus, "w") out_m = open(tmp_minus, "w") strand = "" with open(seq_file) as sh: for line in sh: line = line.strip() if line.startswith(">"): if line[-1] == "+": out_p.write(line + "\n") strand = "plus" elif line[-1] == "-": out_m.write(line + "\n") strand = "minus" else: if strand == "plus": out_p.write(line + "\n") elif strand == "minus": out_m.write(line + "\n") out_p.close() out_m.close() return tmp_plus, tmp_minus def _blast(self, database, database_format, data_type, args_srna, prefixs, program, database_type, e): if (database is None): print("Error: No database assigned!") else: if database_format: self._formatdb(database, data_type, args_srna.out_folder, args_srna.blast_path, database_type) for prefix in prefixs: blast_file = os.path.join( args_srna.out_folder, "blast_result_and_misc", "_".join([database_type, "blast", prefix + ".txt"])) srna_file = "_".join([self.prefixs["basic"], prefix]) out_file = os.path.join( args_srna.out_folder, "_".join(["tmp", database_type, prefix])) print("Running Blast of {0}".format(prefix)) seq_file = os.path.join( args_srna.out_folder, "_".join(["sRNA_seq", prefix])) if seq_file not in os.listdir(args_srna.out_folder): self.helper.get_seq( srna_file, os.path.join(self.fasta_path, prefix + ".fa"), seq_file) if database_type == "nr": tmp_plus, tmp_minus = self._get_strand_fasta( seq_file, args_srna.out_folder) tmp_blast = os.path.join("tmp_blast.txt") self._run_blast(args_srna.blast_path, program, database, e, tmp_plus, tmp_blast, "plus") self._run_blast(args_srna.blast_path, program, database, e, tmp_minus, blast_file, "minus") self.helper.merge_file(tmp_blast, blast_file) os.remove(tmp_blast) os.remove(tmp_plus) os.remove(tmp_minus) else: self._run_blast(args_srna.blast_path, program, database, e, seq_file, blast_file, "both") 
extract_blast(blast_file, srna_file, out_file, out_file + ".csv", database_type) shutil.move(out_file, srna_file) def _class_srna(self, prefixs, args_srna): if (len(args_srna.import_info) != 1) or ( len(args_srna.import_info) != 0): for prefix in prefixs: print("classifying sRNA of {0}".format(prefix)) class_gff = os.path.join(self.gff_output, "for_class") class_table = os.path.join(self.table_output, "for_class") self.helper.check_make_folder(os.path.join(class_table, prefix)) self.helper.check_make_folder(os.path.join(class_gff, prefix)) class_gff = os.path.join(class_gff, prefix) class_table = os.path.join(class_table, prefix) self.helper.check_make_folder(class_table) self.helper.check_make_folder(class_gff) out_stat = os.path.join( self.stat_path, "_".join([ "stat_sRNA_class", prefix + ".csv"])) classify_srna(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), class_gff, out_stat, args_srna) for srna in os.listdir(class_gff): out_table = os.path.join( class_table, srna.replace(".gff", ".csv")) gen_srna_table( os.path.join(class_gff, srna), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, out_table) def _get_best_result(self, prefixs, args_srna): for prefix in prefixs: best_gff = os.path.join(self.all_best["best_gff"], "_".join([prefix, "sRNA.gff"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) gen_best_srna(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), best_gff, args_srna) gen_srna_table(os.path.join(self.all_best["best_gff"], "_".join([prefix, "sRNA.gff"])), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, best_table) def _remove_file(self, args_srna): self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir") self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file") self.helper.remove_tmp(args_srna.fastas) self.helper.remove_tmp(args_srna.gffs) if args_srna.frag_wigs is not None: self.helper.remove_tmp(args_srna.frag_wigs) if args_srna.tex_wigs is not None: self.helper.remove_tmp(args_srna.tex_wigs) if (args_srna.frag_wigs is not None) and ( args_srna.tex_wigs is not None): shutil.rmtree(args_srna.merge_wigs) self.helper.remove_tmp(args_srna.trans) if args_srna.tss_folder is not None: self.helper.remove_tmp(args_srna.tss_folder) if args_srna.pro_folder is not None: self.helper.remove_tmp(args_srna.pro_folder) if args_srna.sorf_file is not None: self.helper.remove_tmp(args_srna.sorf_file) if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) if self.term_path is not None: self.helper.remove_tmp(args_srna.terms) def _filter_srna(self, args_srna, prefixs): if "sec_str" in args_srna.import_info: self._compute_2d_and_energy(args_srna, prefixs) if "blast_nr" in args_srna.import_info: self._blast(args_srna.nr_database, args_srna.nr_format, "prot", args_srna, prefixs, "blastx", "nr", args_srna.e_nr) if "blast_srna" in args_srna.import_info: self._blast(args_srna.srna_database, args_srna.srna_format, "nucl", args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna) if "sorf" in args_srna.import_info: for prefix in prefixs: if ("_".join([prefix, "sORF.gff"]) in os.listdir(self.sorf_path)): tmp_srna = os.path.join(args_srna.out_folder, "".join(["tmp_srna_sorf", prefix])) tmp_sorf = os.path.join(args_srna.out_folder, 
"".join(["tmp_sorf_srna", prefix])) srna_sorf_comparison( "_".join([self.prefixs["basic"], prefix]), os.path.join(self.sorf_path, "_".join([prefix, "sORF.gff"])), tmp_srna, tmp_sorf) os.remove(tmp_sorf) shutil.move(tmp_srna, "_".join([self.prefixs["basic"], prefix])) def _import_info_format(self, import_info): new_info = [] for info in import_info: info = info.lower() new_info.append(info) return new_info def _gen_table(self, prefixs, args_srna): for prefix in prefixs: out_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) gen_srna_table(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, out_table) def _print_rank_all(self, prefixs): for prefix in prefixs: all_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) print_rank_all(all_table, best_table) def _filter_min_utr(self, prefixs, min_utr): for prefix in prefixs: filter_utr(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])), min_utr) def _antisense(self, gffs, prefixs): for prefix in prefixs: all_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) all_gff = os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])) best_gff = os.path.join(self.all_best["best_gff"], "_".join([prefix, "sRNA.gff"])) srna_antisense(all_gff, all_table, os.path.join(gffs, prefix + ".gff")) srna_antisense(best_gff, best_table, os.path.join(gffs, prefix + ".gff")) def _blast_stat(self, stat_path, srna_tables): for srna_table in os.listdir(os.path.join(srna_tables, "best")): out_srna_blast = os.path.join( stat_path, "stat_" + srna_table.replace(".csv", "_blast.csv")) blast_class(os.path.join(srna_tables, "best", srna_table), out_srna_blast) def _compare_term_promoter(self, out_table, prefix, args_srna): if ("term" in args_srna.import_info) and ( self.term_path is not None): compare_srna_term(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), out_table, os.path.join(self.term_path, "_".join([prefix, "term.gff"])), args_srna.fuzzy_b, args_srna.fuzzy_a) if ("promoter" in args_srna.import_info) and ( args_srna.promoter_table is not None) and ( "tss" in args_srna.import_info): compare_srna_promoter(os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), out_table, args_srna) def run_srna_detection(self, args_srna): self._check_necessary_file(args_srna) self.multiparser.parser_gff(args_srna.trans, "transcript") self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None, "transcript") args_srna.import_info = self._import_info_format(args_srna.import_info) prefixs = self._run_program(args_srna) self._filter_srna(args_srna, prefixs) for prefix in prefixs: shutil.copyfile("_".join([self.prefixs["basic"], prefix]), os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"]))) self._compare_term_promoter("_".join([self.prefixs["merge_table"], prefix]), prefix, args_srna) self._gen_table(prefixs, args_srna) self._class_srna(prefixs, args_srna) self._get_best_result(prefixs, args_srna) self._print_rank_all(prefixs) if "blast_srna" in args_srna.import_info: self._blast_stat(self.stat_path, 
self.table_output) self._remove_file(args_srna)
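# Hypothetical driver sketch: run_srna_detection() is the entry point of
# sRNADetection. Only attributes referenced in this class are listed here,
# with placeholder values; ArgsContainer and the detection helpers consume
# further attributes that are defined outside this excerpt.
def _example_run_srna_detection():
    from argparse import Namespace
    args_srna = Namespace(
        gffs="input/gffs", trans="output/transcripts",
        tex_wigs="input/tex_notex_wigs", frag_wigs=None,
        tss_folder="output/TSS", pro_folder=None, sorf_file=None,
        fastas="input/fastas", terms=None, out_folder="output/sRNA",
        utr_srna=False, import_info=["tss", "blast_srna"],
        fuzzy_tsss={"inter": 3})
    sRNADetection(args_srna).run_srna_detection(args_srna)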
self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "fivePrimePlus", rep_set) else: print("Error: Wrong program name! Please assing tss " "or processing_site.") sys.exit() for num_id in range(1, lib_num+1): out.write("genome_{0} = {1}\n".format(num_id, fasta)) for num_id in range(1, lib_num+1): list_num_id.append(str(num_id)) return lib_num, num_id, rep_set, lib_dict, list_num_id def _print_repmatch(self, args_tss, out): '''check replicate match''' detect_all = False for rep in args_tss.repmatch: if "all" in rep: detect_all = True match = rep.split("_")[-1] out.write("minNumRepMatches = {0}\n".format(match)) break if not detect_all: nums = {} matchs = {} for match in args_tss.repmatch: lib = match.split("_")[0] rep = match.split("_")[-1] matchs[lib] = rep if rep not in nums.keys(): nums[rep] = 1 else: nums[rep] += 1 for rep, num in nums.items(): if num == max(nums.values()): out.write("minNumRepMatches = {0}\n".format(rep)) max_rep = rep break for lib, rep in matchs.items(): if rep != max_rep: out.write("minNumRepMatches_{0} = {1}\n".format( lib, rep)) def _gen_config(self, project_strain_name, args_tss, gff, wig_folder, fasta, config_file, log): '''generation of config files''' master_folder = "MasterTable_" + project_strain_name out_path = os.path.join(self.master, master_folder) self.helper.check_make_folder(out_path) out = open(config_file, "w") out.write("TSSinClusterSelectionMethod = HIGHEST\n") out.write("allowedCompareShift = 1\n") out.write("allowedRepCompareShift = 1\n") lib_num, num_id, rep_set, lib_dict, list_num_id = \ self._import_lib(args_tss.libs, wig_folder, project_strain_name, out, gff, args_tss.program, fasta) out.write("idList = ") out.write(",".join(list_num_id) + "\n") out.write("maxASutrLength = 100\n") out.write("maxGapLengthInGene = 500\n") out.write("maxNormalTo5primeFactor = {0}\n".format( args_tss.processing_factor)) out.write("maxTSSinClusterDistance = {0}\n".format( args_tss.cluster + 1)) out.write("maxUTRlength = {0}\n".format(args_tss.utr_length)) out.write("min5primeToNormalFactor = {0}\n".format( args_tss.enrichment_factor)) out.write("minCliffFactor = {0}\n".format(args_tss.factor)) out.write("minCliffFactorDiscount = {0}\n".format( args_tss.factor_reduction)) out.write("minCliffHeight = {0}\n".format(args_tss.height)) out.write("minCliffHeightDiscount = {0}\n".format( args_tss.height_reduction)) out.write("minNormalHeight = {0}\n".format(args_tss.base_height)) self._print_repmatch(args_tss, out) out.write("minPlateauLength = 0\n") out.write("mode = cond\n") out.write("normPercentile = 0.9\n") if args_tss.program.lower() == "tss": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "normalPlus", rep_set) else: self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "normalPlus", rep_set) out.write("numReplicates = {0}\n".format(len(rep_set))) out.write("numberOfDatasets = {0}\n".format(lib_num)) out.write("outputDirectory = {0}\n".format(out_path)) for prefix_id in range(len(args_tss.output_prefixs)): out.write("outputPrefix_{0} = {1}\n".format( prefix_id + 1, args_tss.output_prefixs[prefix_id])) out.write("projectName = {0}\n".format(project_strain_name)) out.write("superGraphCompatibility = igb\n") out.write("texNormPercentile = 0.5\n") out.write("writeGraphs = 0\n") 
out.write("writeNocornacFiles = 0\n") log.write("\t" + config_file + " is generated.\n") out.close() def _convert_gff(self, prefixs, args_tss, log): for prefix in prefixs: out_file = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program]) + ".gff") gff_f = open(out_file, "w") out_path = os.path.join(self.master, "_".join([ "MasterTable", prefix])) if "MasterTable.tsv" not in os.listdir(out_path): print("Error: There is not MasterTable file in {0} ".format( out_path)) print("Please check configuration file.") log.write("not MasterTable file is found in {0}\n".format( out_path)) else: if args_tss.program.lower() == "processing": feature = "processing_site" elif args_tss.program.lower() == "tss": feature = "TSS" self.converter.convert_mastertable2gff( os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic", feature, prefix, out_file) log.write("\t" + out_file + "is generated.\n") gff_f.close() def _merge_manual(self, tsss, args_tss): '''if manual detected TSS is provided, it can merge manual detected TSS and TSSpredator predicted TSS''' self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tss"])) for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break filename = "_".join([tss, args_tss.program]) + ".gff" predict = os.path.join(self.gff_outfolder, filename) manual = os.path.join(self.manual_path, tss + ".gff") fasta = os.path.join(self.fasta_path, tss + ".fa") stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss) if os.path.exists(manual): print("Merging and classiflying manually-detected " "TSSs for {0}".format(tss)) merge_manual_predict_tss( predict, stat_file, os.path.join(self.tmps["tss"], filename), os.path.join(args_tss.gffs, gff), args_tss, manual, fasta) if os.path.exists(stat_file): shutil.move(stat_file, os.path.join( args_tss.out_folder, "statistics", tss, stat_file)) self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder, [".gff"]) shutil.rmtree(self.tmps["tss"]) def _validate(self, tsss, args_tss, log): '''validate TSS with genome annotation''' print("Validating TSSs with genome annotations") log.write("Running validate_gene.py to compare genome " "annotations and TSSs/PSs.\n") for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break stat_file = os.path.join( self.stat_outfolder, tss, "".join(["stat_gene_vali_", tss, ".csv"])) out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff") if args_tss.program.lower() == "tss": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) elif args_tss.program.lower() == "processing": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "processing.gff"])) validate_gff(compare_file, os.path.join(args_tss.gffs, gff), stat_file, out_cds_file, args_tss.utr_length, args_tss.program.lower()) log.write("\t" + stat_file + " is generated.\n") shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff)) def _compare_ta(self, tsss, args_tss, log): '''compare TSS with transcript''' detect = False log.write("Running stat_TA_comparison to compare transcripts " "and TSSs/PSs.\n") print("Comparing transcripts and TSSs") self.multiparser.parser_gff(args_tss.ta_files, "transcript") self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None, "transcript") for tss in tsss: stat_out = os.path.join( self.stat_outfolder, tss, "".join([ "stat_compare_TSS_transcript_", tss, ".csv"])) for ta in os.listdir(self.tmps["ta"]): filename = ta.split("_transcript") if (filename[0] == tss) and 
(filename[1] == ".gff"): detect = True break compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) if detect: stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file, stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"], args_tss.fuzzy) self.helper.sort_gff(self.tmps["tss_ta"], compare_file) self.helper.sort_gff(self.tmps["ta_tss"], os.path.join(args_tss.ta_files, ta)) os.remove(self.tmps["tss_ta"]) os.remove(self.tmps["ta_tss"]) detect = False log.write("\t" + stat_out + " is generated.\n") def _stat_tss(self, tsss, feature, log): print("Running statistaics") for tss in tsss: compare_file = os.path.join(self.gff_outfolder, "_".join([tss, feature]) + ".gff") stat_tsspredator( compare_file, feature, os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "class", tss]) + ".csv"), os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "libs", tss]) + ".csv")) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_class", ".png"]) if os.path.exists(os.path.join( self.stat_outfolder, "TSSstatistics.tsv")): shutil.move( os.path.join( self.stat_outfolder, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, tss, "TSSstatistics.tsv")) plot_venn(compare_file, feature) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_venn", ".png"]) log.write("The following files in {0} are generated:\n".format( (os.path.join(self.stat_outfolder, tss)))) for file_ in os.listdir(os.path.join( self.stat_outfolder, tss)): log.write("\t" + file_ + "\n") def _set_gen_config(self, args_tss, input_folder, log): prefixs = [] detect = False log.write("Generating config files for TSSpredator.\n") for fasta in os.listdir(self.fasta_path): run = False for gff in os.listdir(self.gff_path): if fasta[:-3] == gff[:-4]: prefix = fasta[:-3] for wig in os.listdir(self.wig_path): filename = wig.split("_STRAIN_") if filename[1][:-4] == prefix: detect = True break if detect: prefixs.append(prefix) config = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._gen_config( prefix, args_tss, os.path.join(self.gff_path, gff), self.wig_path, os.path.join(self.fasta_path, fasta), config, log) return prefixs def _merge_wigs(self, wig_folder, prefix, libs): self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tmp"])) for wig_file in os.listdir(wig_folder): for lib in libs: info = lib.split(":") if (info[0][:-4] in wig_file) and (info[-1] == "+") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_forward.wig")) if (info[0][:-4] in wig_file) and (info[-1] == "-") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_reverse.wig")) def _check_orphan(self, prefixs, wig_folder, args_tss): '''if genome has no locus tag, it can use for classify the TSS''' for prefix in prefixs: self._merge_wigs(wig_folder, prefix, args_tss.libs) tmp_tss = os.path.join(self.tmps["tmp"], "_".join([ prefix, args_tss.program + ".gff"])) pre_tss = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program + ".gff"])) check_orphan(pre_tss, os.path.join( args_tss.gffs, prefix + ".gff"), "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss) shutil.move(tmp_tss, pre_tss) shutil.rmtree("tmp") def _remove_files(self, args_tss): print("Remove temperary files and folders") 
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with TSSs and processing sites that are located
        at the same position'''
        if args_tss.overlap_feature:
            print("Comparing TSSs and processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""),
                            None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''filter out low-expressed TSSs/PSs'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix,
                    "_".join(["stat", prefix,
                              "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path,
                                           "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder,
                                 "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("Checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check "
                      "orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(os.path.join(
                    self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low-expressed TSSs/PSs.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs,
                                           self.fasta_path, None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual "
                      "TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the positions "
                  "where TSSs and PSs overlap.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
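# The library strings consumed by this class (args_tss.libs; see
# _merge_wigs() above, which splits each entry on ":") follow a
# five-field convention:
#
#     <wig file>:<tex|notex>:<condition>:<replicate>:<strand>
#
# A minimal parser mirroring that layout; the example value is made up.
def _example_parse_lib(lib):
    wig, tex, condition, replicate, strand = lib.split(":")
    return {"wig": wig, "tex": tex, "condition": int(condition),
            "replicate": replicate, "strand": strand}

# _example_parse_lib("cond1_TEX_forward.wig:tex:1:a:+") returns
# {'wig': 'cond1_TEX_forward.wig', 'tex': 'tex', 'condition': 1,
#  'replicate': 'a', 'strand': '+'}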
class CircRNADetection(object):

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all",
                        "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all",
                      "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam",
                     "sort": "whole_reads_sort"}
        if args_circ.align and (args_circ.fastas is None):
            print("Error: There is no genome fasta file!")
            sys.exit()
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
        time.sleep(5)

    def _deal_zip_file(self, read_folder):
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                unzip_cmd = "bzcat"
                mod_read = read.replace(".bz2", "")
            elif read.endswith(".gz"):
                unzip_cmd = "zcat"
                mod_read = read.replace(".gz", "")
            else:
                continue
            if (".fa" not in mod_read) and (
                    ".fasta" not in mod_read) and (
                    ".fna" not in mod_read):
                mod_read = mod_read + ".fa"
            read_out = open(os.path.join(read_folder, mod_read), "w")
            tmp_reads.append(os.path.join(read_folder, mod_read))
            print("Unzipping {0}".format(read))
            call([unzip_cmd, os.path.join(read_folder, read)],
                 stdout=read_out)
            read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        out = open(os.path.join(self.alignment_path, fasta_prefix,
                                sam_file), "w")
        log = open(os.path.join(self.alignment_path, fasta_prefix,
                                log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read),
                   "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                if read.endswith(".fa") or read.endswith(".fna") or (
                        read.endswith(".fasta")):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix,
                                         fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix,
                                         fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix,
                                                 fasta_prefix]))
                    print("Mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                        args_circ, index, fasta, read,
                        sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files):
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        print("Merging all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sorting bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path,
                           self.bams["sort"] + ".bam"), whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path,
                                  sub_alignment_path):
        print("Converting the whole-reads bam file to sam....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path,
                           self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path,
                           self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path,
                         sub_alignment_path):
        self.helper.check_make_folder(os.path.join(self.splice_path,
                                                   prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"),
            "-d", os.path.join(self.fasta_path, prefix + ".fa"),
            "-q", os.path.join(sub_alignment_path,
                               self.bams["sort"] + ".sam"),
            "-n"])
        os.system(command + " 2>" + err_log)
        self.helper.move_all_content(os.getcwd(), sub_splice_path,
                                     [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(os.path.join(
                    os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.splices["file"]),
                        os.path.join(fasta_prefix,
                                     "_".join([self.splices["splice"],
                                               header + ".bed"])))
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.trans["file"]),
                        os.path.join(fasta_prefix,
                                     "_".join([self.trans["trans"],
                                               header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                fasta_prefix, file_), out_trans)
                else:
                    shutil.move(
                        os.path.join(fasta_prefix,
                                     "_".join([self.splices["splice"],
                                               headers[0] + ".bed"])),
                        out_splice)
                    shutil.move(
                        os.path.join(fasta_prefix,
                                     "_".join([self.trans["trans"],
                                               headers[0] + ".bed"])),
                        out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path,
                                                 prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("Comparing with annotations of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(os.path.join(
                    self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA",
                                           prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA",
                                           prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA",
                                           prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix,
                                           "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix,
                                           "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(
                        os.path.join(args_circ.frag_bams, frag),
                        os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif args_circ.normal_bams is not None:
            merge_folder = args_circ.normal_bams
        elif args_circ.frag_bams is not None:
            merge_folder = args_circ.frag_bams
        else:
            print("Error: Please assign a bam folder or run the "
                  "alignment!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: Please assign the segemehl folder!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = \
                self._assign_merge_bam(args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class CircRNADetection(object): def __init__(self, args_circ): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.alignment_path = os.path.join(args_circ.output_folder, "segemehl_align") self.splice_path = os.path.join(args_circ.output_folder, "segemehl_splice") self.candidate_path = os.path.join(args_circ.output_folder, "circRNA_tables") self.gff_folder = os.path.join(args_circ.output_folder, "gffs") self.gff_path = os.path.join(args_circ.gffs, "tmp") self.splices = {"all_file": "splicesites_all.bed", "file": "splicesites.bed", "all": "splicesites_all", "splice": "splicesites"} self.trans = {"all_file": "transrealigned_all.bed", "file": "transrealigned.bed", "all": "transrealigned_all", "trans": "transrealigned"} self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"} if args_circ.align: if args_circ.fastas is None: print("Error: There is no genome fasta file!!!") sys.exit() else: self.fasta_path = os.path.join(args_circ.fastas, "tmp") else: self.fasta_path = os.path.join(args_circ.fastas, "tmp") def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _deal_zip_file(self, read_folder): tmp_reads = [] for read in os.listdir(read_folder): if read.endswith(".bz2"): mod_read = read.replace(".bz2", "") if (".fa" not in mod_read) and ( ".fasta" not in mod_read) and ( ".fna" not in mod_read): mod_read = mod_read + ".fa" read_out = open(os.path.join(read_folder, mod_read), "w") tmp_reads.append(os.path.join(read_folder, mod_read)) print(" ".join(["unzip", read])) call(["bzcat", os.path.join(read_folder, read)], stdout=read_out) read_out.close() elif read.endswith(".gz"): mod_read = read.replace(".gz", "") if (".fa" not in mod_read) and ( ".fasta" not in mod_read) and ( ".fna" not in mod_read): mod_read = mod_read + ".fa" read_out = open(os.path.join(read_folder, mod_read), "w") tmp_reads.append(os.path.join(read_folder, mod_read)) print(" ".join(["unzip", read])) call(["zcat", os.path.join(read_folder, read)], stdout=read_out) read_out.close() return tmp_reads def _run_segemehl_fasta_index(self, segemehl_path, fasta_path, index, fasta): call([os.path.join(segemehl_path, "segemehl.x"), "-x", os.path.join(fasta_path, index), "-d", os.path.join(fasta_path, fasta)]) def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file, log_file, fasta_prefix): out = open(os.path.join(self.alignment_path, fasta_prefix, sam_file), "w") log = open(os.path.join(self.alignment_path, fasta_prefix, log_file), "w") p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"), "-i", os.path.join(self.fasta_path, index), "-d", os.path.join(self.fasta_path, fasta), "-q", os.path.join(args_circ.read_folder, read), "-S"], stdout=out, stderr=log) return p def _align(self, args_circ): prefixs = [] align_files = [] for fasta in os.listdir(self.fasta_path): index = fasta.replace(".fa", ".idx") self._run_segemehl_fasta_index(args_circ.segemehl_path, self.fasta_path, index, fasta) processes = [] num_process = 0 fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) self.helper.check_make_folder(os.path.join( self.alignment_path, fasta_prefix)) for read in os.listdir(args_circ.read_folder): num_process += 1 if read.endswith(".fa") or \ read.endswith(".fna") or \ read.endswith("fasta"): filename = read.split(".") read_prefix = ".".join(filename[:-1]) sam_file = "_".join([read_prefix, fasta_prefix + ".sam"]) 
log_file = "_".join([read_prefix, fasta_prefix + ".log"]) align_files.append("_".join([read_prefix, fasta_prefix])) print("mapping {0}".format(sam_file)) p = self._run_segemehl_align( args_circ, index, fasta, read, sam_file, log_file, fasta_prefix) processes.append(p) if num_process == args_circ.cores: self._wait_process(processes) num_process = 0 self._wait_process(processes) return align_files, prefixs def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam): call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam]) def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files): bam_files = [] convert_ones = [] remove_ones = [] for sam in os.listdir(sub_alignment_path): pre_sam = os.path.join(sub_alignment_path, sam) if sam.endswith(".sam"): bam_file = sam.replace(".sam", ".bam") print("Convert {0} to {1}".format(sam, bam_file)) out_bam = os.path.join(sub_alignment_path, bam_file) self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam) bam_files.append(out_bam) if align_files: if bam_file.replace(".bam", "") not in align_files: convert_ones.append(out_bam) else: remove_ones.append(pre_sam) elif sam.endswith(".bam"): if (pre_sam not in convert_ones) and ( pre_sam not in remove_ones): bam_files.append(pre_sam) elif sam.endswith(".log"): os.remove(pre_sam) return bam_files, convert_ones, remove_ones def _run_samtools_merge_sort(self, samtools_path, sub_alignment_path, bam_files): print("Merge all bam files....") whole_bam = os.path.join(sub_alignment_path, self.bams["whole"]) if len(bam_files) <= 1: shutil.copyfile(bam_files[0], whole_bam) else: file_line = " ".join(bam_files) os.system(" ".join([samtools_path, "merge", whole_bam, file_line])) print("Sort bam files....") call([samtools_path, "sort", "-o", os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"), whole_bam]) os.remove(os.path.join(sub_alignment_path, self.bams["whole"])) def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path): print("Convert whole reads bam file to sam file....") call([samtools_path, "view", "-h", "-o", os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"), os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")]) def _merge_sort_aligment_file(self, bam_files, samtools_path, sub_alignment_path, convert_ones, tmp_reads, remove_ones): self._run_samtools_merge_sort(samtools_path, sub_alignment_path, bam_files) self._run_samtools_convert_sam(samtools_path, sub_alignment_path) for bam in convert_ones: os.remove(bam) for sam in remove_ones: os.remove(sam) if len(tmp_reads) != 0: for read in tmp_reads: os.remove(read) def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path): self.helper.check_make_folder(os.path.join(self.splice_path, prefix)) sub_splice_path = os.path.join(self.splice_path, prefix) err_log = os.path.join(sub_splice_path, prefix + ".log") print("Running testrealign.x for {0}".format(prefix)) command = " ".join([ os.path.join(segemehl_path, "testrealign.x"), "-d", os.path.join(self.fasta_path, prefix + ".fa"), "-q", os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"), "-n"]) os.system(command + " 2>" + err_log) self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"]) self.helper.remove_all_content(sub_alignment_path, self.bams["sort"], "file") def _merge_bed(self, fastas, splice_path): tmp_prefixs = [] for fasta in os.listdir(fastas): headers = [] if (fasta.endswith(".fa") or fasta.endswith(".fna") or fasta.endswith(".fasta")): with open(os.path.join(fastas, fasta), "r") as f_h: for line in f_h: line 
= line.strip() if line.startswith(">"): headers.append(line[1:]) filename = fasta.split(".") fasta_prefix = ".".join(filename[:-1]) tmp_prefixs.append(fasta_prefix) self.helper.check_make_folder(os.path.join( os.getcwd(), fasta_prefix)) for header in headers: shutil.copyfile(os.path.join(splice_path, header, self.splices["file"]), os.path.join(fasta_prefix, "_".join([self.splices["splice"], header + ".bed"]))) shutil.copyfile(os.path.join(splice_path, header, self.trans["file"]), os.path.join(fasta_prefix, "_".join([self.trans["trans"], header + ".bed"]))) out_splice = os.path.join(fasta_prefix, self.splices["all_file"]) out_trans = os.path.join(fasta_prefix, self.trans["all_file"]) if len(headers) > 1: for file_ in os.listdir(fasta_prefix): if (self.splices["splice"] in file_) and ( self.splices["all"] not in file_): self.helper.merge_file(os.path.join( fasta_prefix, file_), out_splice) elif (self.trans["trans"] in file_) and ( self.trans["all"] not in file_): self.helper.merge_file(os.path.join( fasta_prefix, file_), out_trans) else: shutil.move(os.path.join( fasta_prefix, "_".join([self.splices["splice"], headers[0] + ".bed"])), out_splice) shutil.move(os.path.join( fasta_prefix, "_".join([self.trans["trans"], headers[0] + ".bed"])), out_trans) self.helper.remove_all_content(splice_path, None, "dir") return tmp_prefixs def _stat_and_gen_gff(self, tmp_prefixs, args_circ): for prefix in tmp_prefixs: self.helper.check_make_folder(os.path.join(self.gff_folder, prefix)) shutil.copytree(prefix, os.path.join(self.splice_path, prefix)) self.helper.check_make_folder(os.path.join( self.candidate_path, prefix)) print("comparing with annotation of {0}".format(prefix)) if self.splices["all_file"] in os.listdir(os.path.join( self.splice_path, prefix)): detect_circrna(os.path.join(self.splice_path, prefix, self.splices["all_file"]), os.path.join( self.gff_path, prefix + ".gff"), os.path.join(self.candidate_path, prefix, "_".join(["circRNA", prefix + "_all.csv"])), args_circ, os.path.join(args_circ.stat_folder, "_".join(["stat_circRNA", prefix + ".csv"]))) self.converter.convert_circ2gff( os.path.join(self.candidate_path, prefix, "_".join(["circRNA", prefix + "_all.csv"])), args_circ, os.path.join( self.gff_folder, prefix, "_".join([prefix, "circRNA_all.gff"])), os.path.join(self.gff_folder, prefix, "_".join([prefix, "circRNA_best.gff"]))) def _assign_merge_bam(self, args_circ): remove_frags = [] bam_files = [] if (args_circ.normal_bams is not None) and ( args_circ.frag_bams is not None): for frag in os.listdir(args_circ.frag_bams): if frag.endswith(".bam"): shutil.copyfile(os.path.join(args_circ.frag_bams, frag), os.path.join(args_circ.normal_bams, frag)) remove_frags.append(frag) merge_folder = args_circ.normal_bams elif (args_circ.normal_bams is not None): merge_folder = args_circ.normal_bams elif (args_circ.frag_bams is not None): merge_folder = args_circ.frag_bams else: print("Error: please assign bam folder or do alignment!!") sys.exit() for bam in os.listdir(merge_folder): if bam.endswith(".bam"): bam_files.append(os.path.join(merge_folder, bam)) return merge_folder, remove_frags, bam_files def run_circrna(self, args_circ): for gff in os.listdir(args_circ.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_circ.gffs, gff)) if args_circ.segemehl_path is None: print("Error: please assign segemehl folder!!") sys.exit() self.multiparser.parser_gff(args_circ.gffs, None) self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta", None) tmp_reads = [] if 
args_circ.align: self.multiparser.parser_fasta(args_circ.fastas) tmp_reads = self._deal_zip_file(args_circ.read_folder) align_files, prefixs = self._align(args_circ) else: self.multiparser.parser_fasta(args_circ.fastas) prefixs = [] for fasta in os.listdir(self.fasta_path): fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) merge_folder, remove_frag, bam_files = self._assign_merge_bam( args_circ) align_files = None for prefix in prefixs: if args_circ.align: sub_alignment_path = os.path.join(self.alignment_path, prefix) bam_files, convert_ones, remove_ones = self._convert_sam2bam( sub_alignment_path, args_circ.samtools_path, align_files) else: sub_alignment_path = merge_folder convert_ones = [] remove_ones = [] self._merge_sort_aligment_file( bam_files, args_circ.samtools_path, sub_alignment_path, convert_ones, tmp_reads, remove_ones) self._run_testrealign(prefix, args_circ.segemehl_path, sub_alignment_path) tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path) self.multiparser.parser_gff(args_circ.gffs, None) self.multiparser.combine_gff(args_circ.fastas, self.gff_path, "fasta", None) self._stat_and_gen_gff(tmp_prefixs, args_circ) self.helper.remove_tmp(args_circ.fastas) self.helper.remove_tmp(args_circ.gffs) for tmp_prefix in tmp_prefixs: shutil.rmtree(tmp_prefix) if (not args_circ.align) and (len(remove_frag) != 0): for frag in remove_frag: os.remove(os.path.join(merge_folder, frag))
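# A minimal standalone sketch of the samtools pipeline wrapped by
# _convert_sam2bam and _run_samtools_merge_sort above: convert each SAM to
# BAM, merge the BAMs, then coordinate-sort the merged file. The function
# name and all file names are hypothetical examples, not part of the
# pipeline.
def example_sam_to_sorted_bam(samtools_path, sam_files, sorted_bam):
    from subprocess import call
    bam_files = []
    for sam in sam_files:
        bam = sam.replace(".sam", ".bam")
        # samtools view -bS input.sam -o output.bam
        call([samtools_path, "view", "-bS", sam, "-o", bam])
        bam_files.append(bam)
    # samtools merge merged.bam rep1.bam rep2.bam ...
    call([samtools_path, "merge", "merged.bam"] + bam_files)
    # samtools sort -o sorted.bam merged.bam
    call([samtools_path, "sort", "-o", sorted_bam, "merged.bam"])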
class OperonDetection(object):

    def __init__(self, args_op):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_op.tsss, "tmp")
        self.tran_path = os.path.join(args_op.trans, "tmp")
        self.utr5_path = os.path.join(args_op.utr5s, "tmp")
        self.utr3_path = os.path.join(args_op.utr3s, "tmp")
        self.table_path = os.path.join(args_op.output_folder, "tables")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.term_path = os.path.join(args_op.terms, "tmp")
        else:
            self.term_path = None

    def _check_gff(self, gffs, type_):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _detect_operon(self, prefixs, args_op):
        for prefix in prefixs:
            out_table = os.path.join(
                self.table_path, "_".join(["operon", prefix + ".csv"]))
            print("Detecting operons of {0}".format(prefix))
            tss = self.helper.get_correct_file(
                self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            gff = self.helper.get_correct_file(
                args_op.gffs, ".gff", prefix, None, None)
            if self.term_path is None:
                term = False
            else:
                term = self.helper.get_correct_file(
                    self.term_path, "_term.gff", prefix, None, None)
            operon(tran, tss, gff, term, args_op.tss_fuzzy,
                   args_op.term_fuzzy, args_op.length, out_table)

    def _check_and_parser_gff(self, args_op):
        self._check_gff(args_op.tsss, "tss")
        self._check_gff(args_op.gffs, "gff")
        self._check_gff(args_op.trans, "tran")
        self._check_gff(args_op.utr5s, "utr")
        self._check_gff(args_op.utr3s, "utr")
        self.multiparser.parser_gff(args_op.gffs, None)
        self.multiparser.parser_gff(args_op.tsss, "TSS")
        self.multiparser.combine_gff(args_op.gffs, self.tss_path,
                                     None, "TSS")
        self.multiparser.parser_gff(args_op.trans, "transcript")
        self.multiparser.combine_gff(args_op.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_gff(args_op.utr5s, "5UTR")
        self.multiparser.combine_gff(args_op.gffs, self.utr5_path,
                                     None, "5UTR")
        self.multiparser.parser_gff(args_op.utr3s, "3UTR")
        self.multiparser.combine_gff(args_op.gffs, self.utr3_path,
                                     None, "3UTR")
        if args_op.terms is not None:
            self._check_gff(args_op.terms, "term")
            self.multiparser.parser_gff(args_op.terms, "term")
            self.multiparser.combine_gff(args_op.gffs, self.term_path,
                                         None, "term")

    def _stat(self, table_path, stat_folder):
        for table in os.listdir(table_path):
            if table.startswith("operon_") and table.endswith(".csv"):
                filename = "_".join(["stat", table])
                out_stat = os.path.join(stat_folder, filename)
                stat(os.path.join(table_path, table), out_stat)

    def _combine_gff(self, prefixs, args_op):
        for prefix in prefixs:
            out_file = os.path.join(
                args_op.output_folder, "gffs",
                "_".join([prefix, "all_features.gff"]))
            print("Combining all features of {0}".format(prefix))
            tss = self.helper.get_correct_file(
                self.tss_path, "_TSS.gff", prefix, None, None)
            tran = self.helper.get_correct_file(
                self.tran_path, "_transcript.gff", prefix, None, None)
            gff = self.helper.get_correct_file(
                args_op.gffs, ".gff", prefix, None, None)
            utr5 = self.helper.get_correct_file(
                self.utr5_path, "_5UTR.gff", prefix, None, None)
            utr3 = self.helper.get_correct_file(
                self.utr3_path, "_3UTR.gff", prefix, None, None)
            if self.term_path is None:
                term = None
            else:
                term = self.helper.get_correct_file(
                    self.term_path, "_term.gff", prefix, None, None)
            combine_gff(gff, tran, tss, utr5, utr3, term,
                        args_op.tss_fuzzy, args_op.term_fuzzy, out_file)

    def run_operon(self, args_op):
        self._check_and_parser_gff(args_op)
        prefixs = []
        for gff in os.listdir(args_op.gffs):
            if gff.endswith(".gff"):
                prefixs.append(gff.replace(".gff", ""))
        self._detect_operon(prefixs, args_op)
        if args_op.statistics:
            self._stat(self.table_path, args_op.stat_folder)
        if args_op.combine:
            self._combine_gff(prefixs, args_op)
        self.helper.remove_tmp(args_op.gffs)
        self.helper.remove_tmp(args_op.utr3s)
        self.helper.remove_tmp(args_op.utr5s)
        self.helper.remove_tmp(args_op.tsss)
        self.helper.remove_tmp(args_op.trans)
        if args_op.terms is not None:
            self.helper.remove_tmp(args_op.terms)
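# Usage sketch (kept commented, not executed): run_operon expects an
# argparse-style namespace. The attribute names below are the ones
# OperonDetection actually reads; every path and value is a hypothetical
# example.
# from types import SimpleNamespace
# args_op = SimpleNamespace(
#     gffs="input/gffs", tsss="input/tsss", trans="input/transcripts",
#     utr5s="input/utr5s", utr3s="input/utr3s", terms=None,
#     output_folder="output/operons",
#     stat_folder="output/operons/statistics",
#     tss_fuzzy=5, term_fuzzy=30, length=500,
#     statistics=True, combine=True)
# OperonDetection(args_op).run_operon(args_op)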
class MEME(object): def __init__(self, args_pro): self.multiparser = Multiparser() self.helper = Helper() self.tss_path = os.path.join(args_pro.tsss, "tmp") if args_pro.gffs is not None: self.gff_path = os.path.join(args_pro.gffs, "tmp") else: self.gff_path = None self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class") self.tmp_folder = os.path.join(os.getcwd(), "tmp") self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"), "sec": os.path.join(self.tmp_folder, "secondary.fa"), "inter": os.path.join(self.tmp_folder, "internal.fa"), "anti": os.path.join(self.tmp_folder, "antisense.fa"), "orph": os.path.join(self.tmp_folder, "orphan.fa"), "all_no_orph": "without_orphan.fa", "all": "all_type.fa", "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"), "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")} self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa") self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff") def _run_normal_motif(self, input_path, out_path, filename, fasta, width, args_pro): print(os.path.join(input_path, fasta)) folder = "_".join(["promoter_motifs", filename, str(width), "nt"]) if folder not in os.listdir(out_path): call([args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs", str(args_pro.num_motif), "-w", str(width), "-maxiter", "100", "-evt", str(args_pro.e_value), "-oc", os.path.join(out_path, folder), os.path.join(input_path, fasta)]) def _run_small_motif(self, input_path, out_path, filename, fasta, width, args_pro): data = width.split("-") min_width = data[0] max_width = data[1] folder = "_".join(["promoter_motifs", filename, "-".join([str(min_width), str(max_width)]), "nt"]) if folder not in os.listdir(out_path): call([args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs", str(args_pro.num_motif), "-minsites", "0", "-maxsites", "2", "-minw", str(min_width), "-maxw", str(max_width), "-maxiter", "100", "-evt", str(args_pro.e_value), "-oc", os.path.join(out_path, folder), os.path.join(input_path, fasta)]) def _get_fasta_file(self, fasta_path, prefix): for fasta in os.listdir(fasta_path): if (fasta.endswith(".fa")) and \ (prefix == fasta.replace(".fa", "")): break elif (fasta.endswith(".fna")) and \ (prefix == fasta.replace(".fna", "")): break elif (fasta.endswith(".fasta")) and \ (prefix == fasta.replace(".fasta", "")): break return fasta def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _move_and_merge_fasta(self, input_path, prefix): all_type = os.path.join(self.tmp_folder, self.fastas["all"]) all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"]) if self.fastas["all"] in os.listdir(self.tmp_folder): os.remove(all_type) if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder): os.remove(all_no_orph) shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"]) shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"]) self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"]) del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph) del_repeat_fasta(self.fastas["tmp_all"], all_type) os.remove(self.fastas["tmp_fa"]) os.remove(self.fastas["tmp_all"]) out_prefix = os.path.join(input_path, prefix) shutil.move(self.fastas["pri"], "_".join([ out_prefix, "allstrain_primary.fa"])) 
shutil.move(self.fastas["sec"], "_".join([ out_prefix, "allstrain_secondary.fa"])) shutil.move(self.fastas["inter"], "_".join([ out_prefix, "allstrain_internal.fa"])) shutil.move(self.fastas["anti"], "_".join([ out_prefix, "allstrain_antisense.fa"])) shutil.move(self.fastas["orph"], "_".join([ out_prefix, "allstrain_orphan.fa"])) shutil.move(all_type, "_".join([ out_prefix, "allstrain_all_types.fa"])) shutil.move(all_no_orph, "_".join([ out_prefix, "allstrain_without_orphan.fa"])) def _split_fasta_by_strain(self, input_path): for fasta in os.listdir(input_path): if "allstrain" not in fasta: os.remove(os.path.join(input_path, fasta)) out = None for fasta in os.listdir(input_path): if fasta.endswith(".fa"): pre_strain = "" num_strain = 0 with open(os.path.join(input_path, fasta), "r") as f_h: for line in f_h: line = line.strip() if line.startswith(">"): datas = line.split("_") strain = "_".join(datas[2:]) if pre_strain != strain: num_strain += 1 filename = fasta.split("allstrain") if out is not None: out.close() out = open(os.path.join( input_path, "".join([ filename[0], strain, filename[-1]])), "a") pre_strain = strain out.write(line + "\n") else: out.write(line + "\n") if num_strain <= 1: os.remove(os.path.join(input_path, "".join([filename[0], strain, filename[-1]]))) out.close() def _run_program(self, prefixs, args_pro): for prefix in prefixs: print(prefix) input_path = os.path.join(self.out_fasta, prefix) out_path = os.path.join(args_pro.output_folder, prefix) for fasta in os.listdir(input_path): filename = fasta.replace(".fa", "") for width in args_pro.widths: print("Computing promoters of {0} - {1}".format( fasta, width)) if "-" in width: self._run_small_motif(input_path, out_path, filename, fasta, width, args_pro) else: self._run_normal_motif(input_path, out_path, filename, fasta, width, args_pro) def _combine_file(self, prefixs, args_pro): if args_pro.source: for tss in os.listdir(self.tss_path): if tss.endswith("_TSS.gff"): self.helper.merge_file(os.path.join( self.tss_path, tss), self.all_tss) for fasta in os.listdir(args_pro.fastas): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): self.helper.merge_file(os.path.join( args_pro.fastas, fasta), self.all_fasta) else: for tss in os.listdir(os.path.join( args_pro.output_folder, "TSS_class")): if tss.endswith("_TSS.gff"): self.helper.merge_file(os.path.join( self.tss_path, tss), self.all_tss) for fasta in os.listdir(args_pro.fastas): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): self.helper.merge_file(os.path.join( args_pro.fastas, fasta), self.all_fasta) print("generating fasta file of all fasta files") prefixs.append("allfasta") input_path = os.path.join(self.out_fasta, "allfasta") self.helper.check_make_folder(os.path.join( args_pro.output_folder, "allfasta")) self.helper.check_make_folder(os.path.join( self.out_fasta, "allfasta")) args_pro.source = True upstream(self.all_tss, self.all_fasta, None, None, args_pro) self._move_and_merge_fasta(input_path, "allfasta") def _remove_files(self, args_pro): self.helper.remove_tmp(args_pro.fastas) self.helper.remove_tmp(args_pro.tsss) self.helper.remove_tmp(args_pro.gffs) self.helper.remove_tmp(args_pro.wigs) if "allfasta.fa" in os.listdir(args_pro.fastas): os.remove(self.all_fasta) if "allfasta" in os.listdir(os.getcwd()): shutil.rmtree("allfasta") shutil.rmtree("tmp") def _gen_table(self, output_folder, prefixs, combine): if combine: strains = prefixs + ["allfasta"] else: strains = prefixs for strain in 
strains:
            for folder in os.listdir(os.path.join(output_folder, strain)):
                tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
                gen_promoter_table(
                    os.path.join(output_folder, strain, folder, "meme.txt"),
                    os.path.join(output_folder, strain, folder, "meme.csv"),
                    tss_file)

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        if args_pro.source:
            print("generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro)
        else:
            if (args_pro.gffs is None) or (
                    args_pro.wigs is None) or (
                    args_pro.input_libs is None):
                print("Error: please assign proper annotation, tex +/- "
                      "wig folder and tex-treated libs!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
            print("classifying TSS and extracting fasta of {0}".format(
                prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_class",
                                  "_".join([prefix, "TSS.gff"])), args_pro)

    def run_meme(self, args_pro):
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(
                os.path.join(args_pro.output_folder, prefix))
            self.helper.check_make_folder(
                os.path.join(self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("generating the table...")
        self._gen_table(args_pro.output_folder, prefixs, args_pro.combine)
        self._remove_files(args_pro)
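# For reference, a sketch of the MEME argument list that _run_normal_motif
# composes above for a fixed-width motif search; the function name and the
# parameter values are hypothetical examples. _run_small_motif differs only
# in replacing "-w" with a "-minw"/"-maxw" range and adding
# "-minsites 0 -maxsites 2".
def example_meme_args(meme_path, out_dir, fasta, num_motif, e_value, width):
    return [meme_path, "-maxsize", "1000000", "-dna",
            "-nmotifs", str(num_motif), "-w", str(width),
            "-maxiter", "100", "-evt", str(e_value),
            "-oc", out_dir, fasta]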
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix,
                   rep_set):
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path,
                      prefix):
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Running {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files do not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assign tss or "
                  "processing_site.")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: there is no MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manually detected TSSs are provided, merge them with the
        TSSs predicted by TSSpredator'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Merging and classifying manually detected TSSs ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSSs against the genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSSs with transcripts'''
        detect = False
        print("Comparing transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        print("Running statistics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "class", tss]) +
                             ".csv"),
                os.path.join(self.stat_outfolder, tss,
                             "_".join(["stat", feature, "libs", tss]) +
                             ".csv"))
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(os.path.join(self.stat_outfolder,
                                           "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if the genome has no locus tags, this can still be used to
        classify the TSSs'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(
                self.tmps["tmp"],
                "_".join([prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
            shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Removing temporary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''handle TSSs and processing sites located at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''handle TSSs with low expression'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix,
                    "_".join(["stat", prefix,
                              "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
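# For reference, an abridged example of the TSSpredator .ini file that
# _gen_config writes above; the keys are the ones emitted by the code,
# while all values and paths are hypothetical:
#
#   TSSinClusterSelectionMethod = HIGHEST
#   allowedCompareShift = 1
#   allowedRepCompareShift = 1
#   annotation_1 = gffs/tmp/strain.gff
#   fivePrimePlus_1a = wigs/tmp/lib1_STRAIN_strain.wig
#   genome_1 = fastas/tmp/strain.fa
#   idList = 1
#   minCliffHeight = 0.3
#   minNumRepMatches = 2
#   mode = cond
#   numReplicates = 2
#   numberOfDatasets = 1
#   outputDirectory = MasterTables/MasterTable_strain
#   outputPrefix_1 = TSS
#   projectName = strain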
class sRNATargetPrediction(object): def __init__(self, args_tar): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_parser = Gff3Parser() self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs") self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs") self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex") self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup") self.merge_path = os.path.join(args_tar.out_folder, "merge") self.srna_path = os.path.join(args_tar.srnas, "tmp") self.fasta_path = os.path.join(args_tar.fastas, "tmp") self.gff_path = os.path.join(args_tar.gffs, "tmp") self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log", "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"} def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_rnaplfold(self, vienna_path, file_type, win_size, span, unstr_region, seq_path, prefix, out_path): current = os.getcwd() os.chdir(out_path) command = " ".join([os.path.join(vienna_path, "RNAplfold"), "-W", str(win_size), "-L", str(span), "-u", str(unstr_region), "-O"]) if file_type == "sRNA": os.system("<".join([command, os.path.join(current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))])) else: os.system("<".join([command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"]))])) os.chdir(current) def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _sort_srna_fasta(self, fasta, prefix, path): out = open(os.path.join(path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w") srnas = [] with open(fasta) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): name = line[1:] else: srnas.append({"name": name, "seq": line, "len": len(line)}) srnas = sorted(srnas, key=lambda x: (x["len"])) for srna in srnas: out.write(">" + srna["name"].split("|")[0] + "\n") out.write(srna["seq"] + "\n") out.close() def _read_fasta(self, fasta_file): seq = "" with open(fasta_file, "r") as seq_f: for line in seq_f: line = line.strip() if line.startswith(">"): continue else: seq = seq + line return seq def _get_specific_seq(self, srna_file, seq_file, srna_out, querys): for query in querys: srna_datas = query.split(":") srna = {"seq_id": srna_datas[0], "strand": srna_datas[1], "start": int(srna_datas[2]), "end": int(srna_datas[3])} gff_f = open(srna_file, "r") out = open(srna_out, "a") seq = self._read_fasta(seq_file) num = 0 for entry in self.gff_parser.entries(gff_f): if (entry.seq_id == srna["seq_id"]) and ( entry.strand == srna["strand"]) and ( entry.start == srna["start"]) and ( entry.end == srna["end"]): if "ID" in entry.attributes.keys(): id_ = entry.attributes["ID"] else: id_ = entry.feature + str(num) gene = self.helper.extract_gene(seq, entry.start, entry.end, entry.strand) out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format( id_, entry.seq_id, entry.start, entry.end, entry.strand, gene)) num += 1 gff_f.close() out.close() def _gen_seq(self, prefixs, args_tar): print("Generating sRNA fasta files...") for srna in os.listdir(self.srna_path): if srna.endswith("_sRNA.gff"): prefix = srna.replace("_sRNA.gff", "") prefixs.append(prefix) srna_out = os.path.join(self.srna_seq_path, "_".join([prefix, "sRNA.fa"])) if "all" in args_tar.query: self.helper.get_seq( os.path.join(self.srna_path, srna), 
os.path.join(self.fasta_path, prefix + ".fa"), srna_out) else: if "_".join([prefix, "sRNA.fa"]) in os.listdir( self.srna_seq_path): os.remove(srna_out) self._get_specific_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out, args_tar.query) self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path) print("Generating target fasta files...") for gff in os.listdir(self.gff_path): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") potential_target(os.path.join(self.gff_path, gff), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.target_seq_path), args_tar) file_num = 1 num = 0 sub_prefix = os.path.join(self.target_seq_path, "_".join([prefix, "target"])) sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w") with open((sub_prefix + ".fa"), "r") as t_f: for line in t_f: line = line.strip() if line.startswith(">"): num += 1 if (num == 100): num = 0 file_num += 1 sub_out.close() sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w") sub_out.write(line + "\n") sub_out.close() def _run_rnaplex(self, prefix, rnaplfold_path, args_tar): print("Running RNAplex of {0}".format(prefix)) num_process = 0 processes = [] for seq in os.listdir(self.target_seq_path): if (prefix in seq) and ("_target_" in seq): print("Running RNAplex with {0}".format(seq)) out_rnaplex = open(os.path.join( self.rnaplex_path, prefix, "_".join([ prefix, "RNAplex", str(num_process) + ".txt"])), "w") num_process += 1 p = Popen([os.path.join(args_tar.vienna_path, "RNAplex"), "-q", os.path.join( self.srna_seq_path, "_".join([ self.tmps["tmp"], prefix, "sRNA.fa"])), "-t", os.path.join(self.target_seq_path, seq), "-l", str(args_tar.inter_length), "-e", str(args_tar.energy), "-z", str(args_tar.duplex_dist), "-a", rnaplfold_path], stdout=out_rnaplex) processes.append(p) if num_process % args_tar.core_plex == 0: self._wait_process(processes) self._wait_process(processes) return num_process def _rna_plex(self, prefixs, args_tar): for prefix in prefixs: print("Running RNAplfold of {0}".format(prefix)) self.helper.check_make_folder( os.path.join(self.rnaplex_path, prefix)) rnaplfold_path = os.path.join(self.rnaplex_path, prefix, "RNAplfold") os.mkdir(rnaplfold_path) self._run_rnaplfold( args_tar.vienna_path, "sRNA", args_tar.win_size_s, args_tar.span_s, args_tar.unstr_region_rnaplex_s, self.srna_seq_path, prefix, rnaplfold_path) self._run_rnaplfold( args_tar.vienna_path, "target", args_tar.win_size_t, args_tar.span_t, args_tar.unstr_region_rnaplex_t, self.target_seq_path, prefix, rnaplfold_path) num_process = self._run_rnaplex(prefix, rnaplfold_path, args_tar) rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) if ("_".join([prefix, "RNAplex.txt"]) in os.listdir(os.path.join(self.rnaplex_path, prefix))): os.remove(rnaplex_file) for index in range(0, num_process): self.helper.merge_file(os.path.join( self.rnaplex_path, prefix, "_".join([ prefix, "RNAplex", str(index) + ".txt"])), rnaplex_file) self.helper.remove_all_content(os.path.join( self.rnaplex_path, prefix), "_RNAplex_", "file") self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"]) shutil.move(self.tmps["tmp"], rnaplex_file) def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar): for index in range(1, num_up + 1): out_tmp_up = open(os.path.join( args_tar.out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), "w") out_err = open(os.path.join( args_tar.out_folder, "".join([self.tmps["log"], str(index), ".txt"])), "w") in_up = 
open(os.path.join( args_tar.out_folder, "".join([self.tmps["tmp"], str(index), ".fa"])), "r") p = Popen([os.path.join(args_tar.vienna_path, "RNAup"), "-u", str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"], stdin=in_up, stdout=out_tmp_up, stderr=out_err) processes.append(p) if len(processes) != 0: time.sleep(5) self._wait_process(processes) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_fa"])) self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_txt"])) def _merge_txt(self, num_up, out_rnaup, out_log, out_folder): for index in range(1, num_up + 1): self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), out_rnaup) self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["log"], str(index), ".txt"])), out_log) def _get_continue(self, out_rnaup): srnas = [] matchs = {} out = open("tmp.txt", "w") with open(out_rnaup) as f_h: for line in f_h: line = line.strip() if ">srna" in line: srna = line[1:] srnas.append(srna) matchs[srna] = [] else: matchs[srna].append(line) srnas = srnas[:-1] for srna in srnas: out.write(">" + srna + "\n") for target in matchs[srna]: out.write(target + "\n") out.close() os.remove(out_rnaup) shutil.move("tmp.txt", out_rnaup) return srnas def _rnaup(self, prefixs, args_tar): for prefix in prefixs: srnas = [] print("Running RNAup of {0}".format(prefix)) if not os.path.exists(os.path.join(self.rnaup_path, prefix)): os.mkdir(os.path.join(self.rnaup_path, prefix)) num_up = 0 processes = [] out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.txt"])) out_log = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.log"])) if "_".join([prefix, "RNAup.txt"]) in \ os.listdir(os.path.join(self.rnaup_path, prefix)): if not args_tar.continue_rnaup: os.remove(out_rnaup) os.remove(out_log) else: srnas = self._get_continue(out_rnaup) with open(os.path.join(self.srna_seq_path, "_".join([ self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f: for line in s_f: line = line.strip() if line.startswith(">"): if line[1:] in srnas: start = False continue start = True print("Running RNAup with {0}".format(line[1:])) num_up += 1 out_up = open(os.path.join(args_tar.out_folder, "".join([self.tmps["tmp"], str(num_up), ".fa"])), "w") out_up.write(line + "\n") else: if start: out_up.write(line + "\n") out_up.close() self.helper.merge_file(os.path.join( self.target_seq_path, "_".join([prefix, "target.fa"])), os.path.join(args_tar.out_folder, "".join([self.tmps["tmp"], str(num_up), ".fa"]))) if num_up == args_tar.core_up: self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar) processes = [] num_up = 0 self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar) def _merge_rnaplex_rnaup(self, prefixs, args_tar): for prefix in prefixs: rnaplex_file = None rnaup_file = None out_rnaplex = None out_rnaup = None self.helper.check_make_folder(os.path.join( self.merge_path, prefix)) print("Ranking {0} now...".format(prefix)) if (args_tar.program == "both") or (args_tar.program == "RNAplex"): rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) out_rnaplex = os.path.join( self.rnaplex_path, prefix, "_".join([prefix, "RNAplex_rank.csv"])) if (args_tar.program == "both") or (args_tar.program == "RNAup"): rnaup_file = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup.txt"])) out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix, 
"RNAup_rank.csv"])) merge_srna_target(rnaplex_file, rnaup_file, args_tar, out_rnaplex, out_rnaup, os.path.join(self.merge_path, prefix, "_".join([prefix, "merge.csv"])), os.path.join(self.merge_path, prefix, "_".join([prefix, "overlap.csv"])), os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])), os.path.join(self.gff_path, prefix + ".gff")) def run_srna_target_prediction(self, args_tar): self._check_gff(args_tar.gffs) self._check_gff(args_tar.srnas) self.multiparser.parser_gff(args_tar.gffs, None) self.multiparser.parser_fasta(args_tar.fastas) self.multiparser.parser_gff(args_tar.srnas, "sRNA") prefixs = [] self._gen_seq(prefixs, args_tar) if (args_tar.program == "both") or ( args_tar.program == "RNAplex"): self._rna_plex(prefixs, args_tar) self.helper.remove_all_content(self.target_seq_path, "_target_", "file") if (args_tar.program == "both") or ( args_tar.program == "RNAup"): self._rnaup(prefixs, args_tar) self._merge_rnaplex_rnaup(prefixs, args_tar) if (args_tar.program == "RNAplex") or ( args_tar.program == "both"): for strain in os.listdir(os.path.join( args_tar.out_folder, "RNAplex")): shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex", strain, "RNAplfold")) self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "dir") self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "file") self.helper.remove_tmp(args_tar.gffs) self.helper.remove_tmp(args_tar.srnas) self.helper.remove_tmp(args_tar.fastas) self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
class sRNATargetPrediction(object): '''detection of sRNA-target interaction''' def __init__(self, args_tar): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_parser = Gff3Parser() self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs") self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs") self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results") self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results") self.intarna_path = os.path.join(args_tar.out_folder, "IntaRNA_results") self.merge_path = os.path.join(args_tar.out_folder, "merged_results") self.srna_path = os.path.join(args_tar.srnas, "tmp") self.fasta_path = os.path.join(args_tar.fastas, "tmp") self.gff_path = os.path.join(args_tar.gffs, "tmp") self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup", "log": "tmp_log", "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"} def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span, unstr_region, seq_path, prefix, out_path, log): current = os.getcwd() os.chdir(out_path) command = " ".join([rnaplfold_path, "-W", str(win_size), "-L", str(span), "-u", str(unstr_region), "-O"]) if file_type == "sRNA": log.write("<".join([command, os.path.join(current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))]) + "\n") os.system("<".join([command, os.path.join(current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))])) else: log.write("<".join([command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"]))]) + "\n") os.system("<".join([command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"]))])) os.chdir(current) def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _sort_srna_fasta(self, fasta, prefix, path): out = open(os.path.join(path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w") srnas = [] with open(fasta) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): name = line[1:] else: srnas.append({"name": name, "seq": line, "len": len(line)}) srnas = sorted(srnas, key=lambda x: (x["len"])) for srna in srnas: out.write(">" + srna["name"].split("|")[0] + "\n") out.write(srna["seq"] + "\n") out.close() def _read_fasta(self, fasta_file): seq = "" with open(fasta_file, "r") as seq_f: for line in seq_f: line = line.strip() if line.startswith(">"): continue else: seq = seq + line return seq def _get_specific_seq(self, srna_file, seq_file, srna_out, querys): for query in querys: srna_datas = query.split(":") srna = {"seq_id": srna_datas[0], "strand": srna_datas[3], "start": int(srna_datas[1]), "end": int(srna_datas[2])} gff_f = open(srna_file, "r") out = open(srna_out, "a") seq = self._read_fasta(seq_file) num = 0 detect = False for entry in self.gff_parser.entries(gff_f): if (entry.seq_id == srna["seq_id"]) and ( entry.strand == srna["strand"]) and ( entry.start == srna["start"]) and ( entry.end == srna["end"]): detect = True if "ID" in entry.attributes.keys(): id_ = entry.attributes["ID"] else: id_ = entry.feature + str(num) gene = self.helper.extract_gene(seq, entry.start, entry.end, entry.strand) out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format( id_, entry.seq_id, entry.start, entry.end, entry.strand, gene)) num += 1 if 
not detect:
            print("Error: Some of the query sRNAs do not exist!")
            sys.exit()
        gff_f.close()
        out.close()

    def _gen_seq(self, prefixs, args_tar):
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path,
                                              prefix + ".fa"),
                                 self.target_seq_path, args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
#                            line = line.replace("|", "_")
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open("_".join([
                                sub_prefix, str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                num_process += 1
                log.write(" ".join([args_tar.rnaplex_path, "-q",
                          os.path.join(self.srna_seq_path, "_".join([
                              self.tmps["tmp"], prefix, "sRNA.fa"])),
                          "-t", os.path.join(self.target_seq_path, seq),
                          "-l", str(args_tar.inter_length),
                          "-e", str(args_tar.energy),
                          "-z", str(args_tar.duplex_dist),
                          "-a", rnaplfold_folder]) + "\n")
                p = Popen([args_tar.rnaplex_path, "-q",
                           os.path.join(self.srna_seq_path, "_".join([
                               self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder], stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write("The following temporary files for storing results of "
                  "{0} are generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(
                self.rnaplex_path, prefix, file_) + "\n")
        return num_process

    def _rna_plex(self, prefixs, args_tar, log):
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder, log)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder, log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder,
                                            args_tar, log)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            log.write("Using helper.py to merge the temporary files.\n")
            for index in range(0, num_process):
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(os.path.join(
                self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            log.write(" ".join([args_tar.rnaup_path, "-u",
                                str(args_tar.unstr_region_rnaup),
                                "-o", "--interaction_first"]) + "\n")
            p = Popen([args_tar.rnaup_path, "-u",
                       str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write("The following temporary files for storing results "
                      "of {0} are generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder, file_) +
                          "\n")
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, continue running based on the previous run'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar, log):
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix, "RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix, "RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder, "".join([
                                self.tmps["tmp"], str(num_up),
                                ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder, "".join([
                                    self.tmps["tmp"], str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log,
                                                args_tar, log)
                                processes = []
                                num_up = 0
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup + " is generated and updated.\n")

    def _intarna(self, prefixs, args_tar, log):
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least "
                  "2.0.4.\n")
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(self.srna_seq_path, "_".join([
                      self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", os.path.join(self.target_seq_path,
                                     prefix + "_target.fa"),
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, args_tar, log):
        '''merge the results of IntaRNA, RNAup and RNAplex'''
        log.write("Running merge_rnaplex_rnaup.py to merge the results "
                  "from RNAplex, RNAup, and IntaRNA for generating the "
                  "final output.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(rnaplex_file, rnaup_file, intarna_file,
                              args_tar, out_rnaplex, out_rnaup, out_intarna,
                              os.path.join(self.fasta_path, prefix + ".fa"),
                              merge_file, overlap_file,
                              os.path.join(self.srna_path,
                                           "_".join([prefix, "sRNA.gff"])),
                              os.path.join(self.gff_path, prefix + ".gff"))
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        if (line.startswith(".")) or (
                line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        out_tmp = open(interact_file + "tmp", "w")
        with open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(
                            line, pre, num, pre_num, srna_info, checks,
                            out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        out_tmp.close()
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, args_tar, log)
            self.helper.remove_all_content(self.target_seq_path,
                                           "_target_", "file")
            log.write("The temporary files for running RNAplex are "
                      "deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
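
# Illustrative sketch (not part of the pipeline): _gen_seq above batches
# the target FASTA into chunks of 100 records so that one RNAplex process
# can be launched per chunk. Below is a minimal, self-contained version of
# that batching strategy; the function name and the chunk_size parameter
# are hypothetical, and only the 100-record convention comes from the code
# above.
def _example_split_fasta(fasta_file, sub_prefix, chunk_size=100):
    """Write <sub_prefix>_1.fa, <sub_prefix>_2.fa, ... holding at most
    chunk_size records each, and return the number of files written."""
    file_num = 1
    num = 0
    sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w")
    with open(fasta_file) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                num += 1
            if num == chunk_size:
                # Rotate to a fresh file; the header that reached the
                # limit is written to the new chunk.
                num = 0
                file_num += 1
                sub_out.close()
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
            sub_out.write(line + "\n")
    sub_out.close()
    return file_num
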
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder,
                                        "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express,
                                              "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDSs and transcripts to find the expressed CDSs'''
        log.write("Comparing transcripts and CDSs to get expressed "
                  "CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (
                            cds.end > ta.start) and (
                            cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                                cds.start < ta.end) and (
                                cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                                cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                                cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") +
                  " is temporarily generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])), log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all,
                                                 "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs "
                      "for {0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Translating DNA sequences to protein sequences of "
              "{0}".format(prefix))
        log.write("Running helper.py to translate DNA sequences to "
                  "protein sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) +
                  "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path,
                    tmp_result, log):
        print("Running psortb of {0}".format(prefix))
        log.write("Running PSORTb to predict subcellular localization "
                  "for {0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                                    "_".join([prefix, self.endfix_raw])),
                       "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacterial type! "
                  "Please assign positive or negative.".format(
                      args_sub.gram))
            sys.exit()
        log.write("\t" + os.path.join(tmp_result, "_".join([
            prefix, self.endfix_raw])) + " is temporarily generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file,
                        log):
        '''extract the results of psortb'''
        log.write("Running extract_psortb.py to extract the information "
                  "of localization.\n")
        extract_psortb(os.path.join(
            tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path, "_".join([
                prefix, self.endfix_table])), None, None, args_sub.fuzzy)
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join([
            prefix, self.endfix_table])) + " is temporarily generated.\n")

    def _remove_header(self, out_all):
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join(["#Genome", "Protein", "Strand", "Start",
                             "End", "Location", "Score"]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "#Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result, log):
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result,
                                os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path,
                                                           prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(stat_folder, "_".join([
                    "stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table,
                              os.path.join(stat_folder, prefix),
                              stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder, file_) +
                              "\n")

    def _remove_tmps(self, args_sub):
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub,
                                               log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result, log)
        self._remove_tmps(args_sub)
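
# Illustrative sketch (not part of the pipeline): the four-clause
# condition in SubLocal._compare_cds_tran above is an interval-overlap
# test written out case by case. The helper below labels each case;
# plain integers stand in for the Gff3Parser entries, and the function
# name is hypothetical.
def _example_cds_overlaps_tran(cds_start, cds_end, ta_start, ta_end):
    # CDS covers the transcript's left boundary
    left = ((cds_end < ta_end) and (cds_end > ta_start) and
            (cds_start <= ta_start))
    # CDS covers the transcript's right boundary
    right = ((cds_start > ta_start) and (cds_start < ta_end) and
             (cds_end >= ta_end))
    # CDS completely encloses the transcript
    contains = (cds_end >= ta_end) and (cds_start <= ta_start)
    # CDS lies completely inside the transcript
    within = (cds_end <= ta_end) and (cds_start >= ta_start)
    return left or right or contains or within

# Example: a CDS at 150..400 against a transcript at 100..300 satisfies
# the "right" case, so _example_cds_overlaps_tran(150, 400, 100, 300)
# returns True.
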
class GoTermFinding(object):
    '''Retrieving the GO terms'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_go.out_folder,
                                        "expressed_CDSs")
        self.result_all_path = os.path.join(self.out_all,
                                            "GO_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "GO_term_results")
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        self.all_strain = "all_genomes_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_):
        prefixs = []
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("Extracting GO terms of {0} from UniProt".format(prefix))
            if self.tran_path is not None:
                tran_file = os.path.join(self.tran_path,
                                         "_".join([prefix,
                                                   "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)

    def _merge_files(self, gffs, out_path, out_folder):
        '''merge the files according to the input genome folder'''
        folders = []
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(
                            out_path, filename, csv_file), out_all)
                        shutil.copy(os.path.join(out_path, filename,
                                                 csv_file), folder_path)
                else:
                    shutil.copyfile(os.path.join(
                        out_path, filenames[0],
                        "_".join([filenames[0], "uniprot.csv"])), out_all)
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            folder_prefix = folder.split("/")[-1]
            shutil.move(folder, os.path.join(out_path, folder_prefix))

    def _stat(self, out_path, stat_path, go, goslim, out_folder):
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            if "figs" not in os.listdir(strain_stat_path):
                os.mkdir(fig_path)
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       os.path.join(strain_stat_path,
                                    "_".join(["stat", folder + ".csv"])),
                       out_folder)
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])

    def run_go_term(self, args_go):
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDSs")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all")
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all)
        if args_go.trans is not None:
            print("Computing expressed CDSs")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express")
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express)
        self.helper.remove_tmp_dir(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp_dir(args_go.trans)
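
# Illustrative sketch (not part of the pipeline): GoTermFinding._merge_files
# above relies on the Multiparser convention that a multi-replicon genome
# "foo" keeps its per-replicon gff files in a "foo.gff_folder" directory;
# the per-replicon <replicon>_uniprot.csv tables are then concatenated into
# one genome-level table. A minimal sketch of that merge step; all names
# here are hypothetical.
def _example_merge_uniprot_tables(gff_folder_dir, result_dir, out_csv):
    """Concatenate <replicon>_uniprot.csv for every replicon whose gff is
    listed in gff_folder_dir into a single table at out_csv."""
    # Local imports keep this sketch self-contained.
    import os
    import shutil
    with open(out_csv, "w") as out:
        for gff in sorted(os.listdir(gff_folder_dir)):
            if gff.endswith(".gff"):
                csv_file = os.path.join(
                    result_dir, gff.replace(".gff", "") + "_uniprot.csv")
                with open(csv_file) as fh:
                    shutil.copyfileobj(fh, out)
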
class UTRDetection(object):
    '''detection of UTRs'''

    def __init__(self, args_utr):
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.tss_path = os.path.join(args_utr.tsss, "tmp")
        self.tran_path = os.path.join(args_utr.trans, "tmp")
        self.utr5_path = os.path.join(args_utr.out_folder, "5UTR")
        self.utr3_path = os.path.join(args_utr.out_folder, "3UTR")
        self.utr5_stat_path = os.path.join(self.utr5_path, "statistics")
        self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")

    def _check_folder(self, folder):
        if folder is None:
            print("Error: Some required files are missing!")
            sys.exit()

    def _check_gff(self, folder):
        for gff in os.listdir(folder):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, gff))

    def _compute_utr(self, args_utr):
        for gff in os.listdir(args_utr.gffs):
            if gff.endswith(".gff"):
                prefix = gff[:-4]
                tss = self.helper.get_correct_file(
                    self.tss_path, "_TSS.gff", prefix, None, None)
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                if args_utr.terms:
                    term = self.helper.get_correct_file(
                        os.path.join(args_utr.terms, "tmp"),
                        "_term.gff", prefix, None, None)
                else:
                    term = None
                print("Computing 5'UTRs of {0}".format(prefix))
                detect_5utr(tss, os.path.join(args_utr.gffs, gff), tran,
                            os.path.join(self.utr5_path, "gffs",
                                         "_".join([prefix, "5UTR.gff"])),
                            args_utr)
                print("Computing 3'UTRs of {0}".format(prefix))
                detect_3utr(tran, os.path.join(args_utr.gffs, gff), term,
                            os.path.join(self.utr3_path, "gffs",
                                         "_".join([prefix, "3UTR.gff"])),
                            args_utr)
                self.helper.move_all_content(
                    os.getcwd(), self.utr5_stat_path, ["_5utr_length.png"])
                self.helper.move_all_content(
                    os.getcwd(), self.utr3_stat_path, ["_3utr_length.png"])

    def run_utr_detection(self, args_utr):
        self._check_folder(args_utr.tsss)
        self._check_folder(args_utr.gffs)
        self._check_folder(args_utr.trans)
        self._check_gff(args_utr.tsss)
        self._check_gff(args_utr.gffs)
        self._check_gff(args_utr.trans)
        if args_utr.terms:
            self._check_gff(args_utr.terms)
        self.multiparser.parser_gff(args_utr.gffs, None)
        self.multiparser.parser_gff(args_utr.tsss, "TSS")
        self.multiparser.combine_gff(args_utr.gffs, self.tss_path,
                                     None, "TSS")
        self.multiparser.parser_gff(args_utr.trans, "transcript")
        self.multiparser.combine_gff(args_utr.gffs, self.tran_path,
                                     None, "transcript")
        if args_utr.terms:
            self.multiparser.parser_gff(args_utr.terms, "term")
            self.multiparser.combine_gff(
                args_utr.gffs, os.path.join(args_utr.terms, "tmp"),
                None, "term")
        self._compute_utr(args_utr)
        self.helper.remove_tmp_dir(args_utr.gffs)
        self.helper.remove_tmp_dir(args_utr.tsss)
        self.helper.remove_tmp_dir(args_utr.trans)
        if args_utr.terms:
            self.helper.remove_tmp_dir(args_utr.terms)
        self.helper.remove_tmp(self.utr5_path)
        self.helper.remove_tmp(self.utr3_path)
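
# Illustrative sketch (not part of the pipeline): detect_5utr/detect_3utr
# (imported helpers) pair each transcript with TSSs, CDSs, and
# terminators; at its core a UTR length is a coordinate difference on a
# shared strand. This is not the detect_5utr implementation, only the
# underlying arithmetic; the function name and parameters are
# hypothetical.
def _example_five_utr_length(tss_pos, cds_start, cds_end, strand):
    """5'UTR length between a TSS and its downstream CDS: on "+" the UTR
    runs from the TSS to the CDS start, on "-" from the CDS end back to
    the TSS."""
    if strand == "+":
        return cds_start - tss_pos
    return tss_pos - cds_end

# Example: a TSS at 1000 with a CDS at 1087..2000 on "+" gives an 87 nt
# 5'UTR: _example_five_utr_length(1000, 1087, 2000, "+") == 87.
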
class Ribos(object):
    '''detection of riboswitches and RNA thermometers'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        if args_ribo.tsss is not None:
            self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        if (args_ribo.program == "both") or (
                args_ribo.program == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (args_ribo.program == "both") or (
                args_ribo.program == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {
            "fasta": os.path.join(out_folder, "tmp_fasta"),
            "scan": os.path.join(out_folder, "tmp_scan"),
            "table": os.path.join(out_folder, "tmp_table")
        }
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {
            "csv": feature + ".csv",
            "txt": feature + "_prescan.txt",
            "re_txt": feature + "_scan.txt",
            "re_csv": feature + "_scan.csv"
        }
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam, log):
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        scan = open(scan_file, "w")
        if args_ribo.cutoff.split("_")[0] == "e":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incE", value,
                                "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incE", value, "--acc",
                  rfam, seq], stdout=scan)
        elif args_ribo.cutoff.split("_")[0] == "s":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incT", value,
                                "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incT", value, "--acc",
                  rfam, seq], stdout=scan)
        else:
            print("Error: --cutoff needs to start with 'e' "
                  "(E-value) or 's' (score)!")
            log.write("--cutoff needs to start with 'e' "
                      "(E-value) or 's' (score).\n")
            sys.exit()
        scan.close()
        log.write("Done!\n")
        log.write("\t" + scan_file + " is temporarily generated.\n")
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam, log):
        '''extract the sequences of candidates and scan the candidates'''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for "
                      "{0}".format(prefix))
                if self.tss_path is not None:
                    tss_file = os.path.join(self.tss_path,
                                            prefix + "_TSS.gff")
                else:
                    tss_file = None
                log.write("Running extract_RBS.py to extract potential "
                          "sequences of riboswitches/RNA thermometers "
                          "for {0}.\n".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff), tss_file,
                    os.path.join(self.tran_path,
                                 prefix + "_transcript.gff"),
                    first_seq, args_ribo, feature)
                log.write("\t" + first_seq +
                          " is temporarily generated.\n")
                print("Pre-scanning of {0}".format(prefix))
                log.write("Using Infernal to pre-scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is "
                          "at least 1.1.1.\n")
                first_scan_file = self._run_cmscan(
                    args_ribo, first_seq, "txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_seq = os.path.join(
                    tmp_files["fasta"],
                    "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["csv"]]))
                log.write("Running recompute_RBS.py to update the "
                          "potential sequences of riboswitches/RNA "
                          "thermometers for {0} based on the pre-scanning "
                          "results.\n".format(prefix))
                regenerate_seq(first_scan_file, first_seq, first_table,
                               sec_seq)
                log.write("\t" + sec_seq + " is temporarily generated.\n")
                print("Scanning of {0}".format(prefix))
                log.write("Using Infernal to scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is "
                          "at least 1.1.1.\n")
                sec_scan_file = self._run_cmscan(
                    args_ribo, sec_seq, "re_txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["re_csv"]]))
                log.write("Running recompute_RBS.py and "
                          "modify_rbs_table.py to generate tables for "
                          "{0} based on the scanning results.\n".format(
                              prefix))
                reextract_rbs(sec_scan_file, first_table, sec_table,
                              args_ribo.cutoff)
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id,
                       gff_outfolder, feature, log):
        '''merge the results from the two rounds of scanning'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                log.write("Merging the results from Infernal to generate "
                          "tables for {0}.\n".format(prefix))
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(
                                os.path.join(
                                    tmp_files["table"],
                                    "_".join([entry.seq_id,
                                              suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            self.helper.merge_file(
                                os.path.join(
                                    tmp_files["table"],
                                    "_".join([entry.seq_id,
                                              suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(
                            os.path.join(
                                tmp_files["scan"],
                                "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(
                            os.path.join(
                                tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                log.write("The following files are generated:\n")
                for folder in (table_folder, scan_folder):
                    for file_ in os.listdir(folder):
                        log.write("\t" + os.path.join(folder, file_) +
                                  "\n")
                out_stat = os.path.join(
                    stat_folder,
                    "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                log.write("Running ribo_gff.py to do statistics and "
                          "generate gff files for {0}.\n".format(prefix))
                log.write("The following files are generated:\n")
                out_gff = os.path.join(
                    gff_outfolder, "_".join([prefix, feature + ".gff"]))
                stat_and_covert2gff(
                    os.path.join(table_folder,
                                 "_".join([prefix, suffixs["csv"]])),
                    feature_id, out_gff, args_ribo.fuzzy, out_stat,
                    feature)
                log.write("\t" + out_gff + "\n")
                log.write("\t" + out_stat + "\n")
                fh.close()

    def _remove_tmp(self, args_ribo):
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy,
                        log):
        log.write("Running rbs_overlap.py to remove overlapping "
                  "riboswitches/RNA thermometers.\n")
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                tmp_table = os.path.join(
                    tmp_files["table"],
                    "_".join([gff.replace(".gff", ""), suffixs["csv"]]))
                rbs_overlap(tmp_table, os.path.join(gff_path, gff),
                            type_, fuzzy)
                log.write("\t" + tmp_table + " is updated.\n")

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder, type_,
                         log):
        '''main part of the detection'''
        log.write("Running get_Rfam_ribo.py to get the information of "
                  "riboswitches/RNA thermometers from Rfam.\n")
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        log.write("Using Infernal to compress the Rfam data of "
                  "riboswitches/RNA thermometers.\n")
        log.write("Please make sure the version of Infernal is at least "
                  "1.1.1.\n")
        print("Compressing Rfam of " + feature)
        log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n")
        call([args_ribo.cmpress_path, "-F", rfam])
        log.write("Done!\n")
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo, tmp_files,
                                          suffixs, feature, rfam, log)
        self._remove_overlap(self.gff_path, tmp_files, suffixs, type_,
                             args_ribo.fuzzy, log)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature, log)
        log.write("Running map_ribos.py to extract all the details from "
                  "Rfam.\n")
        mapping_ribos(table_folder, feature_id, feature)
        log.write("The following files are updated:\n")
        for file_ in os.listdir(table_folder):
            log.write("\t" + os.path.join(table_folder, file_) + "\n")
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo, log_t, log_r):
        if args_ribo.fuzzy_rbs > 6:
            if log_t is not None:
                log_t.write("--fuzzy_rbs should be equal to or less than "
                            "6!\n")
            if log_r is not None:
                log_r.write("--fuzzy_rbs should be equal to or less than "
                            "6!\n")
            print("Error: --fuzzy_rbs should be equal to or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        if args_ribo.tsss is not None:
            self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            print("Detecting riboswitches now")
            self._core_prediction(
                args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                self.ribos_tmp_files, self.ribos_table_folder,
                "riboswitch", self.ribos_scan_folder, self.ribos_suffixs,
                self.ribos_stat_folder, self.ribos_gff_outfolder,
                args_ribo.ribos_out_folder, "riboswitch", log_r)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                self.thermo_tmp_files, self.thermo_table_folder,
                "RNA_thermometer", self.thermo_scan_folder,
                self.thermo_suffixs, self.thermo_stat_folder,
                self.thermo_gff_outfolder, args_ribo.thermo_out_folder,
                "thermometer", log_t)
        self._remove_tmp(args_ribo)
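
# Illustrative sketch (not part of the pipeline): Ribos._run_cmscan above
# encodes the inclusion threshold in args_ribo.cutoff as "e_<value>"
# (E-value, passed to cmscan as --incE) or "s_<value>" (bit score, passed
# as --incT). A pure-function sketch of that convention; the function name
# is hypothetical, while --incE, --incT, and --acc are the cmscan options
# used above.
def _example_build_cmscan_args(cmscan_path, cutoff, rfam, seq):
    """Return the cmscan argv implied by an 'e_<x>' or 's_<x>' cutoff."""
    kind, _, value = cutoff.partition("_")
    if kind == "e":
        flag = "--incE"
    elif kind == "s":
        flag = "--incT"
    else:
        raise ValueError("cutoff must start with 'e' or 's': " + cutoff)
    return [cmscan_path, flag, value, "--acc", rfam, seq]

# Example: _example_build_cmscan_args("cmscan", "e_0.001",
# "Rfam_riboswitch.cm", "candidates.fa") returns
# ["cmscan", "--incE", "0.001", "--acc", "Rfam_riboswitch.cm",
#  "candidates.fa"].
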
class Terminator(object):
    '''detection of terminators'''

    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder,
                                               "gffs"),
                          "csv": os.path.join(args_term.out_folder,
                                              "tables")}
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        self.tmps = {"transterm": os.path.join(os.getcwd(),
                                               "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        with open(combine_file, 'w') as result:
            for file_ in files:
                if (file_.endswith(".ptt")) and (
                        os.stat(file_).st_size == 0):
                    print("Warning: No CDS information, "
                          "TransTermHP cannot work!")
                    return "NO_CDS"
                if os.path.exists(file_) and (
                        os.stat(file_).st_size != 0):
                    check_start = False
                    fh = open(file_, 'r')
                    for line in fh:
                        if check_start:
                            result.write(line)
                        if "Location" in line:
                            check_start = True
                        if "\n" not in line:
                            result.write("\n")
                    fh.close()
        return "Normal"

    def _make_gff_folder(self):
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log):
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                    fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa cannot be found.\n".format(prefix))
                    print("Error: {0}.fa cannot be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        log.write("Running converter.py to convert {0} "
                                  "and {1} to {2}, {3}, and {4}.\n".format(
                                      gff_file, srna, ptt_file, rnt_file,
                                      srna.replace(".gff", ".rnt")))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n\t{2}\n".format(
                            ptt_file, rnt_file,
                            srna.replace(".gff", ".rnt")))
                    if (not srna) and (fasta):
                        log.write("Running converter.py to convert {0} "
                                  "to {1}, and {2}.\n".format(
                                      gff_file, ptt_file, rnt_file))
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file,
                            None, None)
                        file_types[prefix] = "normal"
                        log.write("The following files are generated:\n")
                        log.write("\t{0}\n\t{1}\n".format(ptt_file,
                                                          rnt_file))
                else:
                    log.write("Running converter.py to convert {0} "
                              "to {1}, and {2}.\n".format(
                                  gff_file, ptt_file, rnt_file))
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
                    log.write("The following files are generated:\n")
                    log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file))
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                check = self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                check = self._combine_annotation(combine_file, files)
        return check

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term,
                     log):
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)
        log.write(" ".join([
            args_term.TransTermHP_path, "-p", args_term.expterm_path,
            fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
            os.path.join(out_path, "_".join([
                prefix,
                "terminators_within_robust_tail-to-tail_regions.t2t"])),
            "--bag-output", os.path.join(out_path, "_".join([
                prefix, "best_terminator_after_gene.bag"]))]) + "\n")

    def _run_TransTermHP(self, args_term, log):
        self.helper.check_make_folder(self.tmps["transterm"])
        log.write("Running TransTermHP.\n")
        log.write("Make sure the version is at least 2.09.\n")
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                    self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    log.write("{0}.fa cannot be found.\n".format(prefix))
                    print("Error: {0}.fa cannot be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path, "_".join([
                    prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term, log)
                log.write("Done!\n")
                log.write("The following files are generated in "
                          "{0}.\n".format(out_path))
                for file_ in os.listdir(out_path):
                    log.write("\t" + file_ + "\n")
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term, log):
        log.write("Running converter.py to convert the results of "
                  "TransTermHP to gff3 format.\n")
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
                            log.write("\t" + out_file +
                                      " is generated.\n")
        self.multiparser.combine_gff(args_term.gffs,
                                     self.tmps["transterm"], None,
                                     self.tmps["hp"])

    def _combine_wigs(self, args_term):
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''search for terminators with the sRNA information'''
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"],
                                       self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(
                    os.path.join(gff_path, prefix + ".gff"), tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])),
                    tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(self.csvs["all"], "_".join([
                    prefix, self.suffixs["csv"]]))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff), new_gff)
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                                         "Strand", "Detect",
                                         "Coverage_decrease",
                                         "Coverage_detail"]) + "\n")
                out_csv.close()
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            csv_file)
                        pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log):
        log.write("Computing secondary structures of {0}.\n".format(
            prefix))
        log.write("Make sure the version of the Vienna RNA package is at "
                  "least 2.3.2.\n")
        print("Computing secondary structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        log.write(" ".join([RNAfold_path, "<",
                            os.path.join("..", tmp_seq), ">",
                            os.path.join("..", tmp_sec)]) + "\n")
        os.system(" ".join([RNAfold_path, "<",
                            os.path.join("..", tmp_seq), ">",
                            os.path.join("..", tmp_sec)]))
        log.write("Done!\n")
        log.write("\t" + tmp_sec + " is generated for storing secondary "
                  "structures.\n")
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term,
            log):
        '''search for terminators in gene-converged regions'''
        log.write("Searching for terminators located in gene-converged "
                  "regions.\n")
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                log.write("Running get_inter_seq.py to extract the "
                          "potential sequences from {0}.\n".format(prefix))
                intergenic_seq(os.path.join(self.fasta_path,
                                            prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index,
                               args_term)
                log.write("\t" + tmp_seq + " is generated for storing "
                          "the potential sequences.\n")
                self._run_rnafold(args_term.RNAfold_path, tmp_seq,
                                  tmp_sec, prefix, log)
                log.write("Running extract_sec_info.py to extract the "
                          "information of secondary structures from "
                          "{0}.\n".format(prefix))
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                log.write("Running get_polyT.py to detect the terminator "
                          "candidates for {0}.\n".format(prefix))
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
                log.write("\t" + tmp_cand + ", which temporarily stores "
                          "the terminator candidates, is generated.\n")
                print("Detecting terminators for " + prefix)
                log.write("Running detect_coverage_term.py to gain "
                          "high-confidence terminators for "
                          "{0}.\n".format(prefix))
                detect_coverage(
                    tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                    os.path.join(self.tran_path, "_".join([
                        prefix, "transcript.gff"])),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(wig_path, "_".join([prefix,
                                                     "forward.wig"])),
                    os.path.join(wig_path, "_".join([prefix,
                                                     "reverse.wig"])),
                    os.path.join(self.tmps["hp_path"], "_".join([
                        prefix, self.tmps["hp_gff"]])), merge_wigs,
                    os.path.join(self.outfolder["term"], "_".join([
                        prefix, self.suffixs["gff"]])),
                    os.path.join(self.tmps["term_table"], "_".join([
                        prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs,
                                     self.outfolder["term"], None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder,
"tmp_wig")) self.helper.remove_tmp(self.outfolder["term"]) shutil.rmtree(self.tmps["transterm"]) shutil.rmtree(self.tmps["term_table"]) self.helper.remove_all_content(args_term.out_folder, "inter_seq_", "file") self.helper.remove_all_content(self.outfolder["term"], "_term.gff", "file") self.helper.remove_all_content(args_term.out_folder, "inter_sec_", "file") self.helper.remove_all_content(args_term.out_folder, "term_candidates_", "file") def _compute_stat(self, args_term, log): new_prefixs = [] for gff in os.listdir(self.terms["all"]): if gff.endswith("_term_all.gff"): out_tmp = open(self.tmps["gff"], "w") out_tmp.write("##gff-version 3\n") new_prefix = gff.replace("_term_all.gff", "") new_prefixs.append(gff.replace("_term_all.gff", "")) num = 0 fh = open(os.path.join(self.terms["all"], gff)) for entry in self.gff_parser.entries(fh): name = '%0*d' % (5, num) entry.attributes["ID"] = ( entry.seq_id + "_terminator" + str(num)) entry.attributes["Name"] = "_".join(["terminator_" + name]) entry.attribute_string = ";".join([ "=".join(items) for items in entry.attributes.items()]) out_tmp.write("\t".join([entry.info_without_attributes, entry.attribute_string]) + "\n") num += 1 out_tmp.close() fh.close() shutil.move(self.tmps["gff"], os.path.join(self.terms["all"], "_".join([new_prefix, self.suffixs["gff"]]))) log.write("Running stat_term.py to do statistics.\n") stat_path = os.path.join(args_term.out_folder, "statistics") log.write("The following files are generated:\n") for prefix in new_prefixs: stat_term(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])), os.path.join(self.terms["best"], "_".join([prefix, "term"])), os.path.join(self.terms["express"], "_".join([prefix, "term"])), os.path.join(self.terms["non"], "_".join([prefix, "term"]))) shutil.move(os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move(os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]]))) os.remove(os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) log.write("\t" + os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["gff"]])) + "\n") log.write("\t" + os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])) + "\n") log.write("\t" + os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])) + "\n") log.write("\t" + os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]])) + "\n") log.write("\t" + os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]])) + "\n") log.write("\t" + os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]])) + "\n") def _check_gff_file(self, folder): for file_ in os.listdir(folder): if file_.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, file_)) def 
_compare_term_tran(self, args_term, prefixs, log): '''search for terminators associated with transcripts''' self.multiparser.combine_gff(args_term.gffs, self.tran_path, None, "transcript") prefixs = []  # rebuilt from the parsed transcript files; the passed-in list is not reused print("Comparing terminators with transcripts now") for file_ in os.listdir(self.tran_path): if file_.endswith("_transcript.gff"): prefixs.append(file_.replace("_transcript.gff", "")) log.write("Running compare_tran_term.py for comparing transcripts " "and terminators.\n") log.write("The following files are generated:\n") for type_ in ("best_candidates", "expressed_candidates", "all_candidates"): compare_term_tran(self.tran_path, os.path.join(self.outfolder["term"], type_), args_term.fuzzy_up_ta, args_term.fuzzy_down_ta, args_term.out_folder, "terminator", self.outfolder["term"], args_term.trans) for prefix in prefixs: shutil.move( os.path.join( args_term.out_folder, "statistics", "stat_compare_transcript_terminator_" + prefix + ".csv"), os.path.join( args_term.out_folder, "statistics", "_".join(["stat_compare_terminator_transcript", prefix, type_ + ".csv"]))) log.write("\t" + os.path.join( args_term.out_folder, "statistics", "_".join(["stat_compare_terminator_transcript", prefix, type_ + ".csv"])) + "\n")
def _re_table(self, args_term, prefixs, log): log.write("Running re_table.py to generate coverage information.\n") log.write("The following files are updated:\n") for type_ in ["all_candidates", "best_candidates", "expressed_candidates", "non_expressed_candidates"]: for table in os.listdir(os.path.join( args_term.out_folder, "tables", type_)): term_table = os.path.join(args_term.out_folder, "tables", type_, table) reorganize_table(args_term.libs, args_term.merge_wigs, "Coverage_detail", term_table) log.write("\t" + term_table + "\n")
def run_terminator(self, args_term, log): self._check_gff_file(args_term.gffs) self._check_gff_file(args_term.trans) self.multiparser.parser_fasta(args_term.fastas) if (not args_term.gffs) or (not args_term.fastas): print("Error: Please assign gff files " "and fasta files!") sys.exit() file_types, prefixs = self._convert_gff2rntptt( self.gff_path, self.fasta_path, args_term.srnas, log) check = self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path) self._run_TransTermHP(args_term, log) self._convert_to_gff(prefixs, args_term, log) self.helper.remove_tmp(self.gff_path) self.multiparser.parser_gff(args_term.trans, "transcript") self.helper.check_make_folder(self.tmps["term_table"]) if check != "NO_CDS": self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"]) merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path) self._compute_intersection_forward_reverse( prefixs, merge_path, args_term.wig_path, args_term.merge_wigs, args_term, log) self._compute_stat(args_term, log) self._compare_term_tran(args_term, prefixs, log) self._re_table(args_term, prefixs, log) self._remove_tmp_file(args_term.merge_wigs, args_term)
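# Usage sketch (illustrative only; the argument container and its attributes
# are those referenced in the class above, e.g. gffs, fastas, trans, srnas,
# out_folder, hp_folder, TransTermHP_path, RNAfold_path):
#
#     log = open("terminator.log", "w")
#     Terminator(args_term).run_terminator(args_term, log)
#     log.close()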
class SubLocal(object): '''detection of subcellular localization''' def __init__(self, args_sub): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_path = os.path.join(args_sub.gffs, "tmp") self.fasta_path = os.path.join(args_sub.fastas, "tmp") if args_sub.trans is not None: self.tran_path = os.path.join(args_sub.trans, "tmp") else: self.tran_path = None self.out_all = os.path.join(args_sub.out_folder, "all_CDS") self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS") self.all_tmp_path = os.path.join(self.out_all, "tmp") self.express_tmp_path = os.path.join(self.out_express, "tmp") self.all_stat_path = os.path.join(self.out_all, "statistics") self.express_stat_path = os.path.join(self.out_express, "statistics") self.all_tmp_result = os.path.join(self.out_all, "tmp_results") self.express_tmp_result = os.path.join(self.out_express, "tmp_results") self.all_result = os.path.join(self.out_all, "psortb_results") self.express_result = os.path.join(self.out_express, "psortb_results") self.endfix_table = "table.csv" self.endfix_raw = "raw.txt" self._make_folder()
def _make_folder(self): self.helper.check_make_folder(self.out_all) self.helper.check_make_folder(self.out_express) self.helper.check_make_folder(self.all_stat_path) self.helper.check_make_folder(self.express_stat_path) self.helper.check_make_folder(self.all_result) self.helper.check_make_folder(self.express_result)
def _compare_cds_tran(self, gff_file, tran_file): '''compare CDSs with transcripts to find the expressed CDSs''' out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w") cdss = [] fh = open(gff_file) th = open(tran_file) for entry in Gff3Parser().entries(fh): if entry.feature == "CDS": cdss.append(entry) trans = [] for entry in Gff3Parser().entries(th): trans.append(entry) for cds in cdss: for ta in trans: if (cds.strand == ta.strand) and ( cds.seq_id == ta.seq_id): if ((cds.end < ta.end) and ( cds.end > ta.start) and ( cds.start <= ta.start)) or ( (cds.start > ta.start) and ( cds.start < ta.end) and ( cds.end >= ta.end)) or ( (cds.end >= ta.end) and ( cds.start <= ta.start)) or ( (cds.end <= ta.end) and ( cds.start >= ta.start)): out.write(cds.info + "\n") break fh.close() th.close() out.close()
def _get_protein_seq(self, gff, tmp_path, tran_path): prefix = gff.replace(".gff", "") fasta = self.helper.get_correct_file(self.fasta_path, ".fa", prefix, None, None) dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"])) print("Generating CDS fasta files of {0}".format(prefix)) if tran_path is not None: self._compare_cds_tran(os.path.join(self.gff_path, gff), os.path.join(tran_path, "_".join([ prefix, "transcript.gff"]))) self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"), fasta, dna_seq_file) os.remove(os.path.join(self.out_all, "tmp_cds.gff")) else: self.helper.get_cds_seq(os.path.join(self.gff_path, gff), fasta, dna_seq_file) print("Translating DNA sequences to protein sequences of {0}".format(prefix)) self.helper.translation(dna_seq_file, "tmp") prot_seq_file = os.path.join( tmp_path, "_".join([prefix, "protein.fa"])) self.fixer.fix_emboss("tmp", prot_seq_file) os.remove("tmp") return prefix
def _psortb(self, psortb_path, strain_type, prot_seq_file, out_raw, out_err): call([psortb_path, strain_type, prot_seq_file], stdout=out_raw, stderr=out_err)
def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result): print("Running psortb of {0}".format(prefix)) out_err = open(os.path.join(out_folder, "tmp_log"), "w") out_raw = open(os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])), "w") prot_seq_file = os.path.join(tmp_path, "_".join([prefix, "protein.fa"])) if args_sub.gram == "positive": self._psortb(args_sub.psortb_path, "-p", prot_seq_file, out_raw, out_err) elif args_sub.gram == "negative": self._psortb(args_sub.psortb_path, "-n", prot_seq_file, out_raw, out_err) else: print("Error: {0} is not a proper bacterial type!".format( args_sub.gram)) sys.exit() out_err.close() out_raw.close()
def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file): '''extract the results of psortb''' if args_sub.merge: print("Merging gff") extract_psortb(os.path.join( tmp_psortb_path, "_".join([prefix, self.endfix_raw])), os.path.join(tmp_psortb_path, "_".join([ prefix, self.endfix_table])), gff_file, os.path.join(prefix + ".gff"), args_sub.fuzzy) shutil.move(prefix + ".gff", gff_file) else: extract_psortb(os.path.join( tmp_psortb_path, "_".join([prefix, self.endfix_raw])), os.path.join(tmp_psortb_path, "_".join([ prefix, self.endfix_table])), None, None, args_sub.fuzzy)
def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result): for folder in os.listdir(gffs): if folder.endswith(".gff_folder"): prefix = folder.replace(".gff_folder", "") self.helper.check_make_folder( os.path.join(psortb_result, prefix)) merge_table = os.path.join( psortb_result, prefix, "_".join([prefix, self.endfix_table])) for gff in os.listdir(os.path.join(gffs, folder)): result = self.helper.get_correct_file( tmp_psortb_path, "_" + self.endfix_raw, gff.replace(".gff", ""), None, None) shutil.copy(result, os.path.join(psortb_result, prefix)) result = self.helper.get_correct_file( tmp_psortb_path, "_" + self.endfix_table, gff.replace(".gff", ""), None, None) self.helper.merge_file(result, merge_table) self.helper.check_make_folder(os.path.join(stat_path, prefix)) stat_sublocal(merge_table, os.path.join( stat_path, prefix, prefix), os.path.join( stat_path, prefix, "_".join([ "stat", prefix, "sublocal.csv"])))
def _remove_tmps(self, args_sub): self.helper.remove_tmp_dir(args_sub.fastas) self.helper.remove_tmp_dir(args_sub.gffs) self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir") self.helper.remove_all_content(self.out_all, "tmp", "dir") self.helper.remove_all_content(self.out_express, "tmp", "dir") os.remove(os.path.join(self.out_all, "tmp_log")) if args_sub.trans is not None: os.remove(os.path.join(self.out_express, "tmp_log")) self.helper.remove_tmp_dir(args_sub.trans)
def run_sub_local(self, args_sub): for gff in os.listdir(args_sub.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_sub.gffs, gff)) self.multiparser.parser_gff(args_sub.gffs, None) self.multiparser.parser_fasta(args_sub.fastas) if args_sub.trans is not None: self.multiparser.parser_gff(args_sub.trans, "transcript") self.helper.check_make_folder(self.express_tmp_path) self.helper.check_make_folder(self.express_tmp_result) self.helper.check_make_folder(self.all_tmp_path) self.helper.check_make_folder(self.all_tmp_result) for gff in os.listdir(self.gff_path): if args_sub.trans is not None: print("Processing expressed genes now") prefix = self._get_protein_seq(gff, self.express_tmp_path, self.tran_path) self._run_psortb(args_sub, prefix, self.out_express, self.express_tmp_path, self.express_tmp_result) self._extract_result(args_sub, self.express_tmp_result, prefix, os.path.join(self.gff_path, gff)) print("Processing all genes now") prefix = self._get_protein_seq(gff, self.all_tmp_path, None) self._run_psortb(args_sub, prefix, self.out_all,
self.all_tmp_path, self.all_tmp_result) self._extract_result(args_sub, self.all_tmp_result, prefix, os.path.join(self.gff_path, gff)) self._merge_and_stat(args_sub.gffs, self.all_tmp_result, self.all_stat_path, self.all_result) if args_sub.trans is not None: self._merge_and_stat(args_sub.gffs, self.express_tmp_result, self.express_stat_path, self.express_result) self._remove_tmps(args_sub)
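# Usage sketch (illustrative): args_sub.gram selects the psortb mode mapped
# in _run_psortb() above ("positive" -> "-p", "negative" -> "-n"); any other
# value aborts the run.
#
#     SubLocal(args_sub).run_sub_local(args_sub)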
class sRNATargetPrediction(object): '''detection of sRNA-target interaction''' def __init__(self, args_tar): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_parser = Gff3Parser() self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs") self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs") self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results") self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results") self.intarna_path = os.path.join(args_tar.out_folder, "IntaRNA_results") self.merge_path = os.path.join(args_tar.out_folder, "merged_results") self.srna_path = os.path.join(args_tar.srnas, "tmp") self.fasta_path = os.path.join(args_tar.fastas, "tmp") self.gff_path = os.path.join(args_tar.gffs, "tmp") self.tmps = { "tmp": "tmp_srna_target", "rnaup": "tmp_rnaup", "log": "tmp_log", "all_fa": "tmp*.fa", "all_txt": "tmp*.txt" } def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _check_long_id(self, seq_file, long_ids, type_): out_file = seq_file + "_tmp.fa" out = open(out_file, "w") with open(seq_file) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): if len(line) > 40: long_ids[type_].append(line[1:]) out.write(">TMP" + type_ + "_" + str(len(long_ids[type_])) + "\n") else: out.write(line + "\n") else: out.write(line + "\n") out.close() return out_file def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span, unstr_region, long_ids, seq_path, prefix, out_path, log): current = os.getcwd() os.chdir(out_path) command = " ".join([ rnaplfold_path, "-W", str(win_size), "-L", str(span), "-u", str(unstr_region), "-O" ]) if file_type == "sRNA": srna_seq_file = os.path.join( current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"])) out_file = self._check_long_id(srna_seq_file, long_ids, "srna") log.write("<".join([command, out_file]) + "\n") os.system("<".join([command, out_file])) else: tar_seq_file = os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"])) for tar_seq_file in os.listdir(os.path.join(current, seq_path)): if (prefix + "_" + file_type + "_") in tar_seq_file: out_file = self._check_long_id( os.path.join(current, seq_path, tar_seq_file), long_ids, "tar") log.write("<".join([command, out_file]) + "\n") os.system("<".join([command, out_file])) os.chdir(current) def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _sort_srna_fasta(self, fasta, prefix, path): out = open( os.path.join(path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w") srnas = [] with open(fasta) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): name = line[1:] else: srnas.append({"name": name, "seq": line, "len": len(line)}) srnas = sorted(srnas, key=lambda x: (x["len"])) for srna in srnas: out.write(">" + srna["name"].split("|")[0] + "\n") out.write(srna["seq"] + "\n") out.close() def _read_fasta(self, fasta_file): seq = "" with open(fasta_file, "r") as seq_f: for line in seq_f: line = line.strip() if line.startswith(">"): continue else: seq = seq + line return seq def _get_specific_seq(self, srna_file, seq_file, srna_out, querys): for query in querys: srna_datas = query.split(":") srna = { "seq_id": srna_datas[0], "strand": srna_datas[3], "start": int(srna_datas[1]), "end": int(srna_datas[2]) } gff_f 
= open(srna_file, "r") out = open(srna_out, "a") seq = self._read_fasta(seq_file) num = 0 detect = False for entry in self.gff_parser.entries(gff_f): if (entry.seq_id == srna["seq_id"]) and ( entry.strand == srna["strand"]) and ( entry.start == srna["start"]) and (entry.end == srna["end"]): detect = True if "ID" in entry.attributes.keys(): id_ = entry.attributes["ID"] else: id_ = entry.feature + str(num) gene = self.helper.extract_gene(seq, entry.start, entry.end, entry.strand) out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format( id_, entry.seq_id, entry.start, entry.end, entry.strand, gene)) num += 1 if not detect: print("Error: Some of the query sRNAs do not exist!") sys.exit() gff_f.close() out.close()
def _gen_seq(self, prefixs, target_prefixs, args_tar): print("Generating target fasta files") for gff in os.listdir(self.gff_path): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") target_prefixs.append(prefix) detect = False for gff in os.listdir(self.gff_path): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") potential_target(os.path.join(self.gff_path, gff), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.target_seq_path), args_tar, target_prefixs) file_num = 1 num = 0 sub_prefix = os.path.join(self.target_seq_path, "_".join([prefix, "target"])) if os.path.exists(sub_prefix + ".fa"): sub_out = open( "_".join([sub_prefix, str(file_num) + ".fa"]), "w") with open((sub_prefix + ".fa"), "r") as t_f: for line in t_f: line = line.strip() if line.startswith(">"): # line = line.replace("|", "_") num += 1 if (num == 100): num = 0 file_num += 1 sub_out.close() sub_out = open( "_".join( [sub_prefix, str(file_num) + ".fa"]), "w") detect = True sub_out.write(line + "\n") sub_out.close() else: open(sub_prefix + ".fa", "w").close() if not detect: print("No assigned features can be found. " "Please check your genome annotation. 
" "And assign correct features to --target_feature.") sys.exit() print("Generating sRNA fasta files") for srna in os.listdir(self.srna_path): if srna.endswith("_sRNA.gff"): prefix = srna.replace("_sRNA.gff", "") prefixs.append(prefix) srna_out = os.path.join(self.srna_seq_path, "_".join([prefix, "sRNA.fa"])) if "all" in args_tar.query: self.helper.get_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out) else: if "_".join([prefix, "sRNA.fa"]) in os.listdir(self.srna_seq_path): os.remove(srna_out) self._get_specific_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out, args_tar.query) self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path) def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log): print("Running RNAplex of {0}".format(prefix)) num_process = 0 processes = [] for seq in os.listdir(self.target_seq_path): if ("_target_" in seq) and (".fa_tmp.fa" in seq): print("Running RNAplex with {0}".format( seq.replace(".fa_tmp.fa", ""))) out_rnaplex = open( os.path.join( self.rnaplex_path, prefix, "_".join( [prefix, "RNAplex", str(num_process) + ".txt"])), "w") num_process += 1 log.write(" ".join([ args_tar.rnaplex_path, "-q", os.path.join( self.srna_seq_path, "_".join([ self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa" ])), "-t", os.path.join(self.target_seq_path, seq), "-l", str(args_tar.inter_length), "-e", str(args_tar.energy), "-z", str(args_tar.duplex_dist), "-a", rnaplfold_folder ]) + "\n") p = Popen([ args_tar.rnaplex_path, "-q", os.path.join( self.srna_seq_path, "_".join([ self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa" ])), "-t", os.path.join(self.target_seq_path, seq), "-l", str(args_tar.inter_length), "-e", str(args_tar.energy), "-z", str(args_tar.duplex_dist), "-a", rnaplfold_folder ], stdout=out_rnaplex) processes.append(p) if num_process % args_tar.core_plex == 0: self._wait_process(processes) self._wait_process(processes) log.write("The prediction for {0} is done.\n".format(prefix)) log.write( "The following temporary files for storing results of {0} are " "generated:\n".format(prefix)) for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)): log.write("\t" + os.path.join(self.rnaplex_path, prefix, file_) + "\n") return num_process def _restore_long_ids(self, rnaplex_file, long_ids): out = open(rnaplex_file + "tmp", "w") with open(rnaplex_file, "r") as t_f: for line in t_f: line = line.strip() if (line.startswith(">")): if (line.startswith(">TMPtar_")): header = long_ids["tar"][int(line.split("_")[1]) - 1] elif (line.startswith(">TMPsrna_")): header = long_ids["srna"][int(line.split("_")[1]) - 1] else: header = line[1:] out.write(">" + header + "\n") else: out.write(line + "\n") out.close() shutil.move(rnaplex_file + "tmp", rnaplex_file) def _rna_plex(self, prefixs, target_prefixs, args_tar, log): log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n") log.write("Please make sure the version of Vienna RNA package is " "at least 2.3.2.\n") tmp_rnaplfold_folder = os.path.join(self.rnaplex_path, "tmp_RNAplfold") if os.path.exists(tmp_rnaplfold_folder): shutil.rmtree(tmp_rnaplfold_folder) os.mkdir(tmp_rnaplfold_folder) long_ids = {"tar": [], "srna": []} for prefix in target_prefixs: self._run_rnaplfold(args_tar.rnaplfold_path, "target", args_tar.win_size_t, args_tar.span_t, args_tar.unstr_region_rnaplex_t, long_ids, self.target_seq_path, prefix, tmp_rnaplfold_folder, log) for prefix in prefixs: print("Running RNAplfold of {0}".format(prefix)) self.helper.check_make_folder( 
os.path.join(self.rnaplex_path, prefix)) rnaplfold_folder = os.path.join(self.rnaplex_path, prefix, "RNAplfold") shutil.copytree(tmp_rnaplfold_folder, rnaplfold_folder) self._run_rnaplfold(args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s, args_tar.span_s, args_tar.unstr_region_rnaplex_s, long_ids, self.srna_seq_path, prefix, rnaplfold_folder, log) num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar, log) rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) if ("_".join([prefix, "RNAplex.txt"]) in os.listdir(os.path.join(self.rnaplex_path, prefix))): os.remove(rnaplex_file) for index in range(0, num_process): log.write("Using helper.py to merge the temporary files.\n") self.helper.merge_file( os.path.join( self.rnaplex_path, prefix, "_".join([prefix, "RNAplex", str(index) + ".txt"])), rnaplex_file) if (len(long_ids["tar"]) != 0) or (len(long_ids["srna"]) != 0): self._restore_long_ids(rnaplex_file, long_ids) log.write("\t" + rnaplex_file + " is generated.\n") self.helper.remove_all_content( os.path.join(self.rnaplex_path, prefix), "_RNAplex_", "file") self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"]) shutil.move(self.tmps["tmp"], rnaplex_file) shutil.rmtree(rnaplfold_folder) def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log, args_tar, log): for index in range(1, num_up + 1): out_tmp_up = open( os.path.join(args_tar.out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), "w") out_err = open( os.path.join(args_tar.out_folder, "".join([self.tmps["log"], str(index), ".txt"])), "w") in_up = open( os.path.join(args_tar.out_folder, "".join([self.tmps["tmp"], str(index), ".fa"])), "r") log.write(" ".join([ args_tar.rnaup_path, "-u", str(args_tar.unstr_region_rnaup), "-o", "--interaction_first" ]) + "\n") p = Popen([ args_tar.rnaup_path, "-u", str(args_tar.unstr_region_rnaup), "-o", "--interaction_first" ], stdin=in_up, stdout=out_tmp_up, stderr=out_err) processes.append(p) if len(processes) != 0: time.sleep(5) self._wait_process(processes) log.write( "The following temporary files for storing results of {0} are " "generated:\n".format(prefix)) for file_ in os.listdir(os.path.join(args_tar.out_folder)): log.write("\t" + os.path.join(args_tar.out_folder, file_) + "\n") os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_fa"])) self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_txt"])) def _merge_txt(self, num_up, out_rnaup, out_log, out_folder): for index in range(1, num_up + 1): self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), out_rnaup) self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["log"], str(index), ".txt"])), out_log) def _get_continue(self, out_rnaup): '''For RNAup, it can continue running RNAup based on previous run''' srnas = [] matchs = {} out = open("tmp.txt", "w") with open(out_rnaup) as f_h: for line in f_h: line = line.strip() if ">srna" in line: srna = line[1:] srnas.append(srna) matchs[srna] = [] else: matchs[srna].append(line) srnas = srnas[:-1] for srna in srnas: out.write(">" + srna + "\n") for target in matchs[srna]: out.write(target + "\n") out.close() os.remove(out_rnaup) shutil.move("tmp.txt", out_rnaup) return srnas def _rnaup(self, prefixs, target_prefixs, args_tar, log): log.write("Using RNAup to predict sRNA targets.\n") log.write("Please make sure the version of Vienna RNA package is " "at least 2.3.2.\n") for prefix in 
prefixs: srnas = [] print("Running RNAup of {0}".format(prefix)) if not os.path.exists(os.path.join(self.rnaup_path, prefix)): os.mkdir(os.path.join(self.rnaup_path, prefix)) num_up = 0 processes = [] out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.txt"])) out_log = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.log"])) if "_".join([prefix, "RNAup.txt"]) in \ os.listdir(os.path.join(self.rnaup_path, prefix)): if not args_tar.continue_rnaup: os.remove(out_rnaup) os.remove(out_log) else: log.write("The data from the previous run is found.\n") srnas = self._get_continue(out_rnaup) log.write("The previous data is loaded.\n") with open( os.path.join( self.srna_seq_path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f: for line in s_f: line = line.strip() if line.startswith(">"): if line[1:] in srnas: start = False continue start = True print("Running RNAup with {0}".format(line[1:])) num_up += 1 out_up = open( os.path.join( args_tar.out_folder, "".join([self.tmps["tmp"], str(num_up), ".fa"])), "w") out_up.write(line + "\n") else: if start: out_up.write(line + "\n") out_up.close() for prefix in target_prefixs: self.helper.merge_file( os.path.join( self.target_seq_path, "_".join([prefix, "target.fa"])), os.path.join( args_tar.out_folder, "".join([ self.tmps["tmp"], str(num_up), ".fa" ]))) if num_up == args_tar.core_up: self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log, args_tar, log) processes = [] num_up = 0 self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log, args_tar, log) log.write("The prediction for {0} is done.\n".format(prefix)) log.write("\t" + out_rnaup + " is completely generated and updated.\n")
def _intarna(self, prefixs, target_prefixs, args_tar, log): log.write("Using IntaRNA to predict sRNA targets.\n") log.write("Please make sure the version of IntaRNA is at least 2.0.4.\n") all_target = os.path.join(self.target_seq_path, "all_target.fa") if os.path.exists(all_target): os.remove(all_target) for prefix in target_prefixs: self.helper.merge_file( os.path.join(self.target_seq_path, prefix + "_target.fa"), all_target) for prefix in prefixs: print("Running IntaRNA of {0}".format(prefix)) intarna_file = os.path.join(self.intarna_path, prefix, prefix + "_IntaRNA.txt") self.helper.check_make_folder( os.path.join(self.intarna_path, prefix)) call([ args_tar.intarna_path, "-q", os.path.join(self.srna_seq_path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "-t", all_target, "--qAccW", str(args_tar.slide_win_srna), "--qAccL", str(args_tar.max_loop_srna), "--tAccW", str(args_tar.slide_win_target), "--tAccL", str(args_tar.max_loop_target), "--outMode", "C", "-m", args_tar.mode_intarna, "--threads", str(args_tar.core_inta), "--out", intarna_file ]) log.write("The prediction for {0} is done.\n".format(prefix)) log.write("\t" + intarna_file + " is generated.\n")
def _merge_rnaplex_rnaup(self, prefixs, target_prefixs, args_tar, log): '''merge the results of IntaRNA, RNAup and RNAplex''' log.write("Running merge_rnaplex_rnaup.py to merge the results from " "RNAplex, RNAup, and IntaRNA for generating the final output.\n") log.write("The following files are generated:\n") all_gff = os.path.join(self.gff_path, "all.gff") if os.path.exists(all_gff): os.remove(all_gff) for prefix in target_prefixs: self.helper.merge_file( os.path.join(self.gff_path, prefix + ".gff"), all_gff) for prefix in prefixs: rnaplex_file = None rnaup_file = None out_rnaplex = None out_rnaup = None intarna_file = None out_intarna = None
self.helper.check_make_folder(os.path.join(self.merge_path, prefix)) print("Ranking {0} now".format(prefix)) if ("RNAplex" in args_tar.program): rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) out_rnaplex = os.path.join( self.rnaplex_path, prefix, "_".join([prefix, "RNAplex_rank.csv"])) self._remove_repeat(rnaplex_file, "RNAplex") if ("RNAup" in args_tar.program): rnaup_file = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup.txt"])) out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup_rank.csv"])) self._remove_repeat(rnaup_file, "RNAup") if ("IntaRNA" in args_tar.program): intarna_file = os.path.join(self.intarna_path, prefix, "_".join([prefix, "IntaRNA.txt"])) out_intarna = os.path.join( self.intarna_path, prefix, "_".join([prefix, "IntaRNA_rank.csv"])) self._remove_repeat(intarna_file, "IntaRNA") overlap_file = os.path.join(self.merge_path, prefix, "_".join([prefix, "overlap.csv"])) merge_file = os.path.join(self.merge_path, prefix, "_".join([prefix, "merge.csv"])) merge_srna_target( rnaplex_file, rnaup_file, intarna_file, args_tar, out_rnaplex, out_rnaup, out_intarna, os.path.join(self.fasta_path, prefix + ".fa"), merge_file, overlap_file, os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])), all_gff, target_prefixs) if ("RNAplex" in args_tar.program): log.write("\t" + out_rnaplex + "\n") if ("RNAup" in args_tar.program): log.write("\t" + out_rnaup + "\n") if ("IntaRNA" in args_tar.program): log.write("\t" + out_intarna + "\n") if (os.path.exists(merge_file)): log.write("\t" + merge_file + "\n") if (os.path.exists(overlap_file)): log.write("\t" + overlap_file + "\n") def _remove_rnaplex(self, line, num, pre_num, pre, checks, out_tmp, print_): if (line.startswith(">")): if (num % 2 == 1): print_ = False pre = line if (line not in checks): checks[line] = [] print_ = True elif (num % 2 == 0) and (line not in checks[pre]): checks[pre].append(line) print_ = True num = num + 1 else: if (print_): if (num != pre_num): out_tmp.write(pre + "\n") out_tmp.write(checks[pre][-1] + "\n") out_tmp.write(line + "\n") pre_num = num return num, pre_num, print_, pre, def _remove_rnaup(self, line, pre, num, pre_num, srna_info, checks, out_tmp, print_, tar): if (line.startswith(">")): print_ = False tar = False if (pre.startswith(">")): if (pre not in checks): checks[pre] = [line] srna_info = pre print_ = True else: if (line not in checks[pre]): checks[pre].append(line) print_ = True else: if (num != 1): if (line not in checks[srna_info]): checks[srna_info].append(line) print_ = True else: if (print_): if (pre_num != len(checks)): out_tmp.write(srna_info + "\n") out_tmp.write(checks[srna_info][-1] + "\n") out_tmp.write(line + "\n") else: if (not tar): out_tmp.write(checks[srna_info][-1] + "\n") out_tmp.write(line + "\n") pre_num = len(checks) tar = True pre = line num = num + 1 return num, pre_num, print_, pre, tar, srna_info def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp): if (line.startswith(".")) or (line.startswith("(")) or ( line.startswith(")")): seq = line.split(";")[0] if (seq not in checks[tar][srna_info]): checks[tar][srna_info].append(seq) out_tmp.write(line + "\n") else: if (len(line.split(";")) >= 8): tar = line.split(";")[0] srna_info = line.split(";")[3] seq = line.split(";")[7] if (tar not in checks): checks[tar] = {} checks[tar][srna_info] = [seq] out_tmp.write(line + "\n") else: if (srna_info not in checks[tar]): checks[tar][srna_info] = [seq] out_tmp.write(line + "\n") return tar, 
srna_info, seq def _remove_repeat(self, interact_file, type_): checks = {} seq = "" pre = "" srna_info = "" num = 1 tar = False pre_num = 0 print_ = False out_tmp = open(interact_file + "tmp", "w") with open(interact_file) as fh: for line in fh: line = line.strip() if (type_ == "RNAplex"): num, pre_num, print_, pre = self._remove_rnaplex( line, num, pre_num, pre, checks, out_tmp, print_) elif (type_ == "RNAup"): num, pre_num, print_, pre, tar, srna_info = ( self._remove_rnaup(line, pre, num, pre_num, srna_info, checks, out_tmp, print_, tar)) elif (type_ == "IntaRNA"): tar, srna_info, seq = self._remove_intarna( line, checks, tar, srna_info, seq, out_tmp) out_tmp.close() shutil.move(interact_file + "tmp", interact_file) def run_srna_target_prediction(self, args_tar, log): self._check_gff(args_tar.gffs) self._check_gff(args_tar.srnas) self.multiparser.parser_gff(args_tar.gffs, None) self.multiparser.parser_fasta(args_tar.fastas) self.multiparser.parser_gff(args_tar.srnas, "sRNA") prefixs = [] target_prefixs = [] self._gen_seq(prefixs, target_prefixs, args_tar) if ("RNAplex" in args_tar.program): self._rna_plex(prefixs, target_prefixs, args_tar, log) self.helper.remove_all_content(self.target_seq_path, "_target_", "file") shutil.rmtree(os.path.join(self.rnaplex_path, "tmp_RNAplfold")) log.write("The temporary files for running RNAplex are deleted.\n") if ("RNAup" in args_tar.program): self._rnaup(prefixs, target_prefixs, args_tar, log) if ("IntaRNA" in args_tar.program): self._intarna(prefixs, target_prefixs, args_tar, log) self._merge_rnaplex_rnaup(prefixs, target_prefixs, args_tar, log) self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "dir") self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "file") self.helper.remove_tmp_dir(args_tar.gffs) self.helper.remove_tmp_dir(args_tar.srnas) self.helper.remove_tmp_dir(args_tar.fastas) self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file") os.remove(os.path.join(self.target_seq_path, "all_target.fa"))
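# Usage sketch (illustrative): args_tar.program may contain any combination
# of "RNAplex", "RNAup" and "IntaRNA"; each selected tool writes its own
# ranked table before _merge_rnaplex_rnaup() combines them.
#
#     sRNATargetPrediction(args_tar).run_srna_target_prediction(args_tar, log)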
class Ribos(object): '''detection of riboswitches and RNA thermometers''' def __init__(self, args_ribo): self.multiparser = Multiparser() self.helper = Helper() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_ribo.gffs, "tmp") if args_ribo.tsss is not None: self.tss_path = os.path.join(args_ribo.tsss, "tmp") else: self.tss_path = None self.tran_path = os.path.join(args_ribo.trans, "tmp") self.fasta_path = os.path.join(args_ribo.fastas, "tmp") if (args_ribo.program == "both") or ( args_ribo.program == "riboswitch"): (self.ribos_stat_folder, self.ribos_gff_outfolder, self.ribos_table_folder, self.ribos_scan_folder, self.ribos_tmp_files, self.ribos_rfam, self.ribos_suffixs) = self._create_out_folders( args_ribo.ribos_out_folder, "riboswitch", args_ribo.database) if (args_ribo.program == "both") or ( args_ribo.program == "thermometer"): (self.thermo_stat_folder, self.thermo_gff_outfolder, self.thermo_table_folder, self.thermo_scan_folder, self.thermo_tmp_files, self.thermo_rfam, self.thermo_suffixs) = self._create_out_folders( args_ribo.thermo_out_folder, "RNA_thermometer", args_ribo.database)
def _create_out_folders(self, out_folder, feature, database): stat_folder = os.path.join(out_folder, "statistics") gff_outfolder = os.path.join(out_folder, "gffs") table_folder = os.path.join(out_folder, "tables") scan_folder = os.path.join(out_folder, "scan_Rfam_results") tmp_files = {"fasta": os.path.join( out_folder, "tmp_fasta"), "scan": os.path.join( out_folder, "tmp_scan"), "table": os.path.join( out_folder, "tmp_table")} rfam = os.path.join(database, "Rfam_" + feature + ".cm") suffixs = {"csv": feature + ".csv", "txt": feature + "_prescan.txt", "re_txt": feature + "_scan.txt", "re_csv": feature + "_scan.csv"} return (stat_folder, gff_outfolder, table_folder, scan_folder, tmp_files, rfam, suffixs)
def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files, suffixs, rfam, log): scan_file = os.path.join(tmp_files["scan"], "_".join([prefix, suffixs[type_]])) scan = open(scan_file, "w") if args_ribo.cutoff.split("_")[0] == "e": value = args_ribo.cutoff.split("_")[-1] log.write(" ".join([args_ribo.cmscan_path, "--incE", value, "--acc", rfam, seq]) + "\n") call([args_ribo.cmscan_path, "--incE", value, "--acc", rfam, seq], stdout=scan) elif args_ribo.cutoff.split("_")[0] == "s": value = args_ribo.cutoff.split("_")[-1] log.write(" ".join([args_ribo.cmscan_path, "--incT", value, "--acc", rfam, seq]) + "\n") call([args_ribo.cmscan_path, "--incT", value, "--acc", rfam, seq], stdout=scan) else: print("Error: the --cutoff needs to start with 'e' " "(E-value) or 's' (score)!") log.write("The --cutoff needs to start with 'e' " "(E-value) or 's' (score).\n") sys.exit() scan.close() log.write("Done!\n") log.write("\t" + scan_file + " is temporarily generated.\n") return scan_file
def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs, feature, rfam, log): '''extract the sequences of candidates and scan them''' for gff in os.listdir(self.gff_path): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") first_seq = os.path.join(tmp_files["fasta"], prefix + ".fa") prefixs.append(prefix) print("Extracting sequences of candidates for {0}".format( prefix)) if self.tss_path is not None: tss_file = os.path.join(self.tss_path, prefix + "_TSS.gff") else: tss_file = None log.write("Running extract_RBS.py to extract potential " "sequences of riboswitches/RNA thermometers for " "{0}.\n".format(prefix)) extract_potential_rbs( os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.gff_path, gff), tss_file, os.path.join(self.tran_path, prefix + "_transcript.gff"), first_seq, args_ribo, feature) log.write("\t" + first_seq + " is temporarily generated.\n") print("Pre-scanning of {0}".format(prefix)) log.write("Using Infernal to pre-scan riboswitches/RNA " "thermometers for {0}.\n".format(prefix)) log.write("Please make sure the version of Infernal is at least 1.1.1.\n") first_scan_file = self._run_cmscan( args_ribo, first_seq, "txt", prefix, tmp_files, suffixs, rfam, log) sec_seq = os.path.join(tmp_files["fasta"], "_".join([prefix, "regenerate.fa"])) first_table = os.path.join( tmp_files["table"], "_".join([prefix, suffixs["csv"]])) log.write("Running recompute_RBS.py to update the potential " "sequences of riboswitches/RNA thermometers for {0} " "based on the pre-scanning results.\n".format(prefix)) regenerate_seq(first_scan_file, first_seq, first_table, sec_seq) log.write("\t" + sec_seq + " is temporarily generated.\n") print("Scanning of {0}".format(prefix)) log.write("Using Infernal to scan riboswitches/RNA " "thermometers for {0}.\n".format(prefix)) log.write("Please make sure the version of Infernal is at " "least 1.1.1.\n") sec_scan_file = self._run_cmscan( args_ribo, sec_seq, "re_txt", prefix, tmp_files, suffixs, rfam, log) sec_table = os.path.join( tmp_files["table"], "_".join([prefix, suffixs["re_csv"]])) log.write("Running recompute_RBS.py and modify_rbs_table.py " "to generate tables for {0} " "based on the scanning results.\n".format(prefix)) reextract_rbs(sec_scan_file, first_table, sec_table, args_ribo.cutoff) shutil.move(sec_table, first_table) modify_table(first_table, args_ribo.output_all) return prefixs
def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files, table_folder, stat_folder, feature_id, gff_outfolder, feature, log): '''merge the results of the two scanning runs''' for gff in os.listdir(args_ribo.gffs): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") print("Merging results of {0}".format(prefix)) pre_strain = "" self.helper.check_make_folder(os.path.join( scan_folder, prefix)) fh = open(os.path.join(args_ribo.gffs, gff)) log.write("Merging the results from Infernal to generate " "tables for {0}.\n".format(prefix)) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: if len(pre_strain) == 0: shutil.copyfile(os.path.join( tmp_files["table"], "_".join([entry.seq_id, suffixs["csv"]])), os.path.join( table_folder, "_".join([prefix, suffixs["csv"]]))) else: self.helper.merge_file(os.path.join( tmp_files["table"], "_".join([entry.seq_id, suffixs["csv"]])), os.path.join( table_folder, "_".join([prefix, suffixs["csv"]]))) shutil.copy(os.path.join( tmp_files["scan"], "_".join([entry.seq_id, suffixs["txt"]])), os.path.join(scan_folder, prefix)) shutil.copy(os.path.join( tmp_files["scan"], "_".join([entry.seq_id, suffixs["re_txt"]])), os.path.join(scan_folder, prefix)) pre_strain = entry.seq_id log.write("The following files are generated.\n") for folder in (table_folder, scan_folder): for file_ in os.listdir(folder): log.write("\t" + os.path.join(folder, file_) + "\n") out_stat = os.path.join( stat_folder, "_".join(["stat", prefix, feature + ".txt"])) print("Computing statistics of {0}".format(prefix)) log.write("Running ribo_gff.py to do statistics and generate " "gff files for {0}.\n".format(prefix)) log.write("The following files are generated:\n") out_gff = os.path.join(gff_outfolder, "_".join([ prefix, feature + ".gff"])) stat_and_covert2gff(os.path.join( table_folder, "_".join([prefix, suffixs["csv"]])), feature_id, out_gff, args_ribo.fuzzy, out_stat, feature) log.write("\t" + out_gff + "\n") log.write("\t" + out_stat + "\n") fh.close()
def _remove_tmp(self, args_ribo): self.helper.remove_tmp_dir(args_ribo.gffs) self.helper.remove_tmp_dir(args_ribo.fastas) self.helper.remove_tmp_dir(args_ribo.trans) self.helper.remove_tmp_dir(args_ribo.tsss)
def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy, log): log.write("Running rbs_overlap.py to remove the overlapping " "riboswitches/RNA thermometers.\n") for gff in os.listdir(gff_path): if gff.endswith(".gff"): tmp_table = os.path.join(os.path.join( tmp_files["table"], "_".join([ gff.replace(".gff", ""), suffixs["csv"]]))) rbs_overlap(tmp_table, os.path.join(gff_path, gff), type_, fuzzy) log.write("\t" + tmp_table + " is updated.\n")
def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files, table_folder, feature, scan_folder, suffixs, stat_folder, gff_outfolder, out_folder, type_, log): '''main part of detection''' log.write("Running get_Rfam_ribo.py to get the information of " "riboswitches/RNA thermometers from Rfam.\n") rbs_from_rfam(feature_id, args_ribo.rfam, rfam) log.write("Using Infernal to compress the Rfam data of " "riboswitches/RNA thermometers.\n") log.write("Please make sure the version of Infernal is at least 1.1.1.\n") print("Compressing Rfam of " + feature) log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n") call([args_ribo.cmpress_path, "-F", rfam]) log.write("Done!\n") prefixs = [] self.helper.check_make_folder(tmp_files["fasta"]) self.helper.check_make_folder(tmp_files["scan"]) self.helper.check_make_folder(tmp_files["table"]) prefixs = self._scan_extract_rfam( prefixs, args_ribo, tmp_files, suffixs, feature, rfam, log) self._remove_overlap(self.gff_path, tmp_files, suffixs, type_, args_ribo.fuzzy, log) self._merge_results(args_ribo, scan_folder, suffixs, tmp_files, table_folder, stat_folder, feature_id, gff_outfolder, feature, log) log.write("Running map_ribos.py to extract all the details from Rfam.\n") mapping_ribos(table_folder, feature_id, feature) log.write("The following files are updated:\n") for file_ in os.listdir(table_folder): log.write("\t" + os.path.join(table_folder, file_) + "\n") self.helper.remove_all_content(out_folder, "tmp", "dir")
def run_ribos(self, args_ribo, log_t, log_r): if args_ribo.fuzzy_rbs > 6: if log_t is not None: log_t.write("--fuzzy_rbs should be less than or equal to 6!\n") if log_r is not None: log_r.write("--fuzzy_rbs should be less than or equal to 6!\n") print("Error: --fuzzy_rbs should be less than or equal to 6!") sys.exit() self.multiparser.parser_gff(args_ribo.gffs, None) self.multiparser.parser_fasta(args_ribo.fastas) self.multiparser.parser_gff(args_ribo.trans, "transcript") if args_ribo.tsss is not None: self.multiparser.parser_gff(args_ribo.tsss, "TSS") for gff in os.listdir(args_ribo.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_ribo.gffs, gff)) if (args_ribo.program.lower() == "both") or ( args_ribo.program.lower() == "riboswitch"): print("Detecting riboswitches now") self._core_prediction( args_ribo, args_ribo.ribos_id, self.ribos_rfam, self.ribos_tmp_files, self.ribos_table_folder, "riboswitch", self.ribos_scan_folder, self.ribos_suffixs, self.ribos_stat_folder, self.ribos_gff_outfolder, args_ribo.ribos_out_folder, "riboswitch", log_r) if (args_ribo.program.lower() == "both") or ( args_ribo.program.lower() == "thermometer"): print("Detecting RNA thermometers now")
self._core_prediction( args_ribo, args_ribo.thermo_id, self.thermo_rfam, self.thermo_tmp_files, self.thermo_table_folder, "RNA_thermometer", self.thermo_scan_folder, self.thermo_suffixs, self.thermo_stat_folder, self.thermo_gff_outfolder, args_ribo.thermo_out_folder, "thermometer", log_t) self._remove_tmp(args_ribo)
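# Usage sketch (illustrative): args_ribo.cutoff follows the "e_<value>"
# (inclusion E-value) or "s_<value>" (bit score) convention parsed in
# _run_cmscan(), e.g. "e_0.001".
#
#     Ribos(args_ribo).run_ribos(args_ribo, log_t, log_r)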
class OperonDetection(object): '''detection of operons''' def __init__(self, args_op): self.multiparser = Multiparser() self.helper = Helper() if args_op.tsss is not None: self.tss_path = os.path.join(args_op.tsss, "tmp") else: self.tss_path = None self.tran_path = os.path.join(args_op.trans, "tmp") self.table_path = os.path.join(args_op.output_folder, "tables") if args_op.terms is not None: self._check_gff(args_op.terms, "term") self.term_path = os.path.join(args_op.terms, "tmp") else: self.term_path = None
def _check_gff(self, gffs, type_): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff))
def _detect_operon(self, prefixs, args_op, log): log.write("Running detect_operon.py to detect operons.\n") log.write("The following files are generated:\n") for prefix in prefixs: out_gff = os.path.join(args_op.output_folder, "gffs", "_".join([prefix, "operon.gff"])) out_table = os.path.join(self.table_path, "_".join([prefix, "operon.csv"])) print("Detecting operons of {0}".format(prefix)) if self.tss_path is None: tss = False else: tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) tran = self.helper.get_correct_file( self.tran_path, "_transcript.gff", prefix, None, None) gff = self.helper.get_correct_file( args_op.gffs, ".gff", prefix, None, None) if self.term_path is None: term = False else: term = self.helper.get_correct_file( self.term_path, "_term.gff", prefix, None, None) operon(tran, tss, gff, term, args_op.tss_fuzzy, args_op.term_fuzzy, args_op.length, out_table, out_gff) log.write("\t" + out_table + "\n") log.write("\t" + out_gff + "\n")
def _check_and_parser_gff(self, args_op): self._check_gff(args_op.gffs, "gff") self._check_gff(args_op.trans, "tran") self.multiparser.parser_gff(args_op.gffs, None) self.multiparser.parser_gff(args_op.trans, "transcript") self.multiparser.combine_gff(args_op.gffs, self.tran_path, None, "transcript") if args_op.tsss is not None: self._check_gff(args_op.tsss, "tss") self.multiparser.parser_gff(args_op.tsss, "TSS") self.multiparser.combine_gff(args_op.gffs, self.tss_path, None, "TSS") if args_op.terms is not None: self._check_gff(args_op.terms, "term") self.multiparser.parser_gff(args_op.terms, "term") self.multiparser.combine_gff(args_op.gffs, self.term_path, None, "term")
def _stat(self, table_path, stat_folder, log): log.write("Running stat_operon.py to do statistics.\n") for table in os.listdir(table_path): if table.endswith("_operon.csv"): filename = "_".join(["stat", table]) out_stat = os.path.join(stat_folder, filename) stat(os.path.join(table_path, table), out_stat) log.write("\t" + out_stat + "\n")
def run_operon(self, args_op, log): self._check_and_parser_gff(args_op) prefixs = [] for gff in os.listdir(args_op.gffs): if gff.endswith(".gff"): prefixs.append(gff.replace(".gff", "")) self._detect_operon(prefixs, args_op, log) self._stat(self.table_path, args_op.stat_folder, log) self.helper.remove_tmp_dir(args_op.gffs) self.helper.remove_tmp_dir(args_op.tsss) self.helper.remove_tmp_dir(args_op.trans) if args_op.terms is not None: self.helper.remove_tmp_dir(args_op.terms)
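# Usage sketch (illustrative): TSS and terminator annotations are optional;
# when args_op.tsss or args_op.terms is None, operon() receives False for the
# corresponding feature, as handled in _detect_operon() above.
#
#     OperonDetection(args_op).run_operon(args_op, log)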
class CircRNADetection(object):
    '''Detection of circRNA'''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallel processes to finish'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            # copy the list so that appending the uncompressed files does
            # not mutate reads["files"] while it is being iterated
            tmp_datas = list(reads["files"])
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        log.write(" ".join([segemehl_path,
                            "-x", os.path.join(fasta_path, index),
                            "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path,
                                fasta_prefix, sam_file), "w")
        # keep the per-alignment stderr file separate from the main run
        # log (the original assignment shadowed the "log" parameter)
        align_log = open(os.path.join(self.alignment_path,
                                      fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                            "-i", os.path.join(self.fasta_path, index),
                            "-d", os.path.join(self.fasta_path, fasta),
                            "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=align_log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the reads; if BAM files are provided, this step
        can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the reads.\n")
        log.write("Please make sure the version of segemehl is at least "
                  "0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index,
                                           fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                            read_name.endswith(".fna") or \
                            read_name.endswith(".fasta") or \
                            read_name.endswith(".fq") or \
                            read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join([read_prefix,
                                             fasta_prefix + ".sam"])
                        log_file = "_".join([read_prefix,
                                             fasta_prefix + ".log"])
                        align_files.append("_".join([read_prefix,
                                                     fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
            self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools is at least "
                  "1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(sub_alignment_path, file_) +
                          "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version of Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0],
sample_bam) else: file_line = " ".join(bam_data["files"]) log.write( " ".join([samtools_path, "merge", sample_bam, file_line]) + "\n") os.system(" ".join( [samtools_path, "merge", sample_bam, file_line])) print("Sorting bam files for {0} of {1}".format( prefix, bam_data["sample"])) sort_sample = os.path.join( out_folder, "_".join([prefix, bam_data["sample"] + "_sort.bam"])) log.write(" ".join( [samtools_path, "sort", "-o", sort_sample, sample_bam]) + "\n") call([samtools_path, "sort", "-o", sort_sample, sample_bam]) os.remove(sample_bam) print("Converting bam files to sam files for {0} of {1}".format( prefix, bam_data["sample"])) log.write(" ".join([ samtools_path, "view", "-h", "-o", sort_sample.replace(".bam", ".sam"), sort_sample ]) + "\n") call([ samtools_path, "view", "-h", "-o", sort_sample.replace(".bam", ".sam"), sort_sample ]) log.write("Done!\n") log.write("\t" + sort_sample.replace(".bam", ".sam") + " is generated.\n") def _merge_sort_aligment_file(self, bam_datas, read_datas, samtools_path, out_folder, convert_ones, tmp_reads, remove_ones, prefix, log): if bam_datas is None: merge_bam_datas = [] for read_data in read_datas: bam_files = [] for read in read_data["files"]: if read.endswith(".gz") or read.endswith(".bz2"): read = ".".join(read.split("/")[-1].split(".")[:-1]) read_prefix = ".".join(read.split("/")[-1].split(".")[:-1]) bam_files.append( os.path.join(self.alignment_path, prefix, "_".join([read_prefix, prefix + ".bam"]))) merge_bam_datas.append({ "sample": read_data["sample"], "files": bam_files }) elif (bam_datas is not None) and (read_datas is not None): merge_bam_datas = copy.deepcopy(bam_datas) for bam_data in merge_bam_datas: for read_data in read_datas: if bam_data["sample"] == read_data["sample"]: for read in read_data["files"]: read_prefix = ".".join( read.split("/")[-1].split(".")[:-1]) bam = os.path.join( self.alignment_path, prefix, "_".join([read_prefix, prefix + ".bam"])) if (bam not in bam_data["files"]): bam_data["files"].append(bam) else: merge_bam_datas = copy.deepcopy(bam_datas) self._run_samtools_merge_sort(samtools_path, prefix, out_folder, merge_bam_datas, log) for bam in convert_ones: os.remove(bam) for sam in remove_ones: os.remove(sam) def _run_testrealign(self, prefix, testrealign_path, out_folder, log): log.write("Using Segemehl to detect circular RNAs.\n") log.write( "Please make sure the version of Segemehl is at least 0.1.9.\n") log.write( "Please make sure your testrealign.x exists. 
If it does not " "exist, please reinstall Segemehl by running 'make all'.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix +
                                       "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_),
                                out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_),
                                out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the results to gff files'''
        log.write("Running circRNA.py to do statistics and generate gff "
                  "files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.splice_path,
                                                       prefix))
            for bed in os.listdir(os.path.join(
                    args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file, os.path.join(
                    self.gff_path, prefix + ".gff"), csv_all,
                    args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample,
                                          "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(self.alignment_path,
                                                  prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files cannot both be "
                      "empty.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exist.\n")
            print("Error: please assign the segemehl path!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files,
prefixs = self._align(args_circ, tmp_reads, log) else: align_files = None prefixs = [] for fasta in os.listdir(self.fasta_path): if fasta.endswith(".fa"): fasta_prefix = fasta.replace(".fa", "") prefixs.append(fasta_prefix) for prefix in prefixs: if args_circ.read_files: sub_alignment_path = os.path.join(self.alignment_path, prefix) bam_files, convert_ones, remove_ones = self._convert_sam2bam( sub_alignment_path, args_circ.samtools_path, align_files, log) else: convert_ones = [] remove_ones = [] self._merge_sort_aligment_file(bam_datas, read_datas, args_circ.samtools_path, args_circ.output_folder, convert_ones, tmp_reads, remove_ones, prefix, log) self._run_testrealign(prefix, args_circ.testrealign_path, args_circ.output_folder, log) samples, fa_prefixs = self._merge_bed(args_circ.fastas, self.splice_path, args_circ.output_folder) self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log) if len(tmp_reads) != 0: for reads in tmp_reads: for read in reads["zips"]: os.remove(read) self._remove_tmp_files(args_circ, fa_prefixs)
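# For reference, the sample mapping consumed by run_circrna is encoded as
# "SAMPLE:file1,file2,..." strings and unpacked by _extract_input_files
# (see above). A hedged illustration with made-up file names:
#
#     args_circ.read_files = ["wt:reads/wt_1.fastq.gz,reads/wt_2.fastq.gz"]
#     read_datas = self._extract_input_files(args_circ.read_files)
#     # read_datas == [{"sample": "wt",
#     #                 "files": ["reads/wt_1.fastq.gz",
#     #                           "reads/wt_2.fastq.gz"]}]
#
# Compressed reads are uncompressed by _deal_zip_file first, so .gz/.bz2
# inputs are fine; BAM input via --bam_files follows the same SAMPLE:files
# pattern and skips the alignment step.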
class sRNADetection(object): '''detection of sRNA''' def __init__(self, args_srna): self.args_container = ArgsContainer() self.helper = Helper() self.multiparser = Multiparser() self.gff_output = os.path.join(args_srna.out_folder, "gffs") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.tss_path = self._check_folder_exist(args_srna.tss_folder) self.pro_path = self._check_folder_exist(args_srna.pro_folder) self.sorf_path = self._check_folder_exist(args_srna.sorf_file) self.fasta_path = os.path.join(args_srna.fastas, "tmp") self.tran_path = os.path.join(args_srna.trans, "tmp") self.term_path = self._check_folder_exist(args_srna.terms) self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs") self.prefixs = { "merge": os.path.join(args_srna.out_folder, "tmp_merge"), "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"), "normal": os.path.join(args_srna.out_folder, "tmp_normal"), "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"), "merge_table": os.path.join(args_srna.out_folder, "tmp_merge_table"), "utr_table": os.path.join(args_srna.out_folder, "tmp_utrsrna_table"), "normal_table": os.path.join(args_srna.out_folder, "tmp_normal_table"), "in_cds_table": os.path.join(args_srna.out_folder, "tmp_incds_table"), "basic": os.path.join(args_srna.out_folder, "tmp_basic"), "energy": os.path.join(args_srna.out_folder, "tmp_energy") } self.tmps = { "nr": os.path.join(args_srna.out_folder, "tmp_nr"), "srna": os.path.join(args_srna.out_folder, "tmp_sRNA") } self.best_table = os.path.join(self.table_output, "best") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.all_best = { "all_gff": os.path.join(self.gff_output, "all_candidates"), "best_gff": os.path.join(self.gff_output, "best"), "all_table": os.path.join(self.table_output, "all_candidates"), "best_table": os.path.join(self.table_output, "best") } def _check_folder_exist(self, folder): if folder is not None: path = os.path.join(folder, "tmp") else: path = None return path def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_format(self, blast_path, database, type_, db_file, err): call([ os.path.join(blast_path, "makeblastdb"), "-in", database, "-dbtype", type_, "-out", db_file ], stderr=err) def _formatdb(self, database, type_, out_folder, blast_path, database_type): err = open(os.path.join(out_folder, "log.txt"), "w") if (database.endswith(".fa")) or (database.endswith(".fna")) or ( database.endswith(".fasta")): pass else: folders = database.split("/") filename = folders[-1] folder = "/".join(folders[:-1]) for fasta in os.listdir(folder): if (fasta.endswith(".fa")) or (fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): if ".".join(fasta.split(".")[:-1]) == filename: database = os.path.join(folder, fasta) if database_type == "sRNA": change_format(database, "tmp_srna_database") os.remove(database) shutil.move("tmp_srna_database", database) db_file = ".".join(database.split(".")[:-1]) self._run_format(blast_path, database, type_, db_file, err) err.close() def _merge_frag_tex_file(self, files, args_srna): '''merge the results of fragmented and tex treated libs''' if (args_srna.frag_wigs is not None) and (args_srna.tex_wigs is not None): self.helper.merge_file(files["frag_gff"], files["tex_gff"]) self.helper.merge_file(files["frag_csv"], files["tex_csv"]) 
shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) os.remove(files["frag_csv"]) os.remove(files["frag_gff"]) os.remove(files["tex_gff"]) elif (args_srna.frag_wigs is not None): shutil.move(files["frag_csv"], files["merge_csv"]) self.helper.sort_gff(files["frag_gff"], files["merge_gff"]) os.remove(files["frag_gff"]) elif (args_srna.tex_wigs is not None): shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) def _read_lib_wig(self, args_srna): libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder) wigs_f = read_wig(args_srna.wig_f_file, "+", libs) wigs_r = read_wig(args_srna.wig_r_file, "-", libs) return [libs, texs, wigs_f, wigs_r] def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna): '''detection of intergenic and antisense sRNA''' tex_datas = None frag_datas = None if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter")) files = { "frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None } if self.tss_path is not None: tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff", prefix, None, None) else: tss = None if self.pro_path is not None: pro = self.helper.get_correct_file(self.pro_path, "_processing.gff", prefix, None, None) else: pro = None if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join(args_srna.out_folder, "_".join(["tmp_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag_table", prefix])) args_srna = self.args_container.container_intersrna( "frag", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) frag_datas = self._read_lib_wig(args_srna) intergenic_srna(args_srna, frag_datas[0], frag_datas[1], frag_datas[2], frag_datas[3]) if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join(args_srna.out_folder, "_".join(["tmp_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex_table", prefix])) args_srna = self.args_container.container_intersrna( "tex", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) tex_datas = self._read_lib_wig(args_srna) intergenic_srna(args_srna, tex_datas[0], tex_datas[1], tex_datas[2], tex_datas[3]) files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["normal"], prefix]) self._merge_frag_tex_file(files, args_srna) if ("TSS_class" in os.listdir( args_srna.out_folder)) and (not args_srna.tss_source): tss = os.path.join(args_srna.out_folder, "TSS_class", prefix + "_TSS.gff") return tss, frag_datas, tex_datas def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna, frag_datas, tex_datas): '''detection of UTR-derived sRNA''' if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) files = { "frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None } if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join(args_srna.out_folder, "_".join(["tmp_utr_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix, args_srna) 
            utr_derived_srna(args_srna, tex_datas[0], tex_datas[1],
                             tex_datas[2], tex_datas[3])
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"),
                "frag", prefix, args_srna)
            utr_derived_srna(args_srna, frag_datas[0], frag_datas[1],
                             frag_datas[2], frag_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"],
                   args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: required input files are missing!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: the TSS files required for UTR-derived "
                      "sRNA detection are missing!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: no processing site files are assigned "
                      "for UTR-derived sRNA detection.")
                print("This may affect the results!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        if args_srna.import_info is not None:
            if args_srna.utr_srna or (
                    "sec_str" in args_srna.import_info) or (
                    args_srna.nr_database is not None) or (
                    args_srna.srna_database is not None):
                if args_srna.fastas is None:
                    print("Error: the fasta files required for "
                          "UTR-derived sRNA detection are missing!")
                    sys.exit()
        self.multiparser.parser_fasta(args_srna.fastas)
        self.multiparser.combine_fasta(args_srna.gffs, self.fasta_path,
                                       None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _merge_tex_frag_datas(self, tex_datas, frag_datas):
        if (tex_datas is not None) and (frag_datas is not None):
            for index in [2, 3]:
                for strain, conds in frag_datas[index].items():
                    if strain not in tex_datas[index].keys():
                        tex_datas[index][strain] = conds
                    else:
                        for cond, tracks in conds.items():
                            tex_datas[index][strain][cond] = tracks
        elif (tex_datas is None) and (frag_datas is not None):
            tex_datas = frag_datas
        return tex_datas

    def _run_program(self, args_srna):
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"],
                                            prefix])}
                csvs = {"merge": "_".join([self.prefixs["merge_table"],
                                           prefix]),
                        "utr": "_".join([self.prefixs["utr_table"],
                                         prefix]),
                        "normal": "_".join([self.prefixs["normal_table"],
                                            prefix])}
                tss, frag_datas, tex_datas = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of "
                          "{0}".format(prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff",
                            prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss, pro,
                                          args_srna, frag_datas,
                                          tex_datas)
                tex_datas = self._merge_tex_frag_datas(tex_datas,
                                                       frag_datas)
                del frag_datas
                gc.collect()
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff),
                                 tss, tex_datas)
                del tex_datas
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"],
                                               prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix,
                    gff_file, tss, tex_datas):
        print("Merging sRNA data...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs, tex_datas[2],
                         tex_datas[3], tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        os.system(" ".join(["cat", seq_file, "|",
                            os.path.join(vienna_path, "RNAfold"),
                            "-p", ">", sec_file]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        '''extract the secondary structure energy'''
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
                    fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder,
                                    "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder,
                                    "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error: There is no fasta file of {0}.".format(prefix))
            print("Please check the imported information.")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(
            os.path.join(main_path,
                         "_".join([self.prefixs["basic"], prefix])),
            sec_file,
            os.path.join(main_path,
                         "_".join([self.prefixs["energy"], prefix])))
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path,
                "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file,
                    rel_file):
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp_paths["tmp"], file_),
                            os.path.join(tmp_paths["tmp"], dot_file),
                            ">", os.path.join(tmp_paths["tmp"],
                                              rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_),
              pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file =
file_.replace("ss.ps", "rss.ps") print("replot {0}".format(file_)) self._run_replot(vienna_util, tmp_paths, file_, dot_file, rel_file) for file_ in os.listdir(tmp_paths["tmp"]): if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")): pdf_file = file_.replace(".ps", ".pdf") print("convert {0} to pdf".format(file_)) self._convert_pdf(ps2pdf14_path, tmp_paths, file_, pdf_file) os.mkdir(os.path.join(tmp_paths["sec"], prefix)) os.mkdir(os.path.join(tmp_paths["dot"], prefix)) self.helper.move_all_content(tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix), ["rss.pdf"]) self.helper.move_all_content(tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix), ["dp.pdf"]) def _run_mountain(self, vienna_util, tmp_paths, dot_file, out): call([ os.path.join(vienna_util, "mountain.pl"), os.path.join(tmp_paths["tmp"], dot_file) ], stdout=out) def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix, vienna_util): if mountain: tmp_moun_path = os.path.join(tmp_paths["main"], moun_path) os.mkdir(os.path.join(tmp_moun_path, prefix)) txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt") self.helper.check_make_folder(txt_path) print("Generating mountain plot of {0}....".format(prefix)) for dot_file in os.listdir(tmp_paths["tmp"]): if dot_file.endswith("dp.ps"): moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt") out = open(moun_txt, "w") moun_file = dot_file.replace("dp.ps", "mountain.pdf") print("Generating {0}".format(moun_file)) self._run_mountain(vienna_util, tmp_paths, dot_file, out) plot_mountain_plot(moun_txt, moun_file) shutil.move(moun_file, os.path.join(tmp_moun_path, prefix, moun_file)) out.close() os.remove(moun_txt) def _compute_2d_and_energy(self, args_srna, prefixs): print("Running energy calculation....") moun_path = os.path.join(args_srna.out_folder, "mountain_plot") sec_path = os.path.join(args_srna.out_folder, "sec_structure", "sec_plot") dot_path = os.path.join(args_srna.out_folder, "sec_structure", "dot_plot") self.helper.remove_all_content(sec_path, None, "dir") self.helper.remove_all_content(dot_path, None, "dir") self.helper.remove_all_content(moun_path, None, "dir") for prefix in prefixs: tmp_paths = self._get_seq_sec(self.fasta_path, args_srna.out_folder, prefix, sec_path, dot_path, args_srna.vienna_path) self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths, args_srna.ps2pdf14_path, prefix) self._plot_mountain(args_srna.mountain, moun_path, tmp_paths, prefix, args_srna.vienna_util) self.helper.remove_all_content(os.getcwd(), ".ps", "file") os.chdir(tmp_paths["main"]) shutil.move("_".join([self.prefixs["energy"], prefix]), "_".join([self.prefixs["basic"], prefix])) shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna")) def _run_blast(self, blast_path, program, database, e, seq_file, blast_file, strand): call([ os.path.join(blast_path, program), "-db", database, "-evalue", str(e), "-strand", strand, "-query", seq_file, "-out", blast_file ]) def _get_strand_fasta(self, seq_file, out_folder): tmp_plus = os.path.join(out_folder, "tmp_plus.fa") tmp_minus = os.path.join(out_folder, "tmp_minus.fa") out_p = open(tmp_plus, "w") out_m = open(tmp_minus, "w") strand = "" with open(seq_file) as sh: for line in sh: line = line.strip() if line.startswith(">"): if line[-1] == "+": out_p.write(line + "\n") strand = "plus" elif line[-1] == "-": out_m.write(line + "\n") strand = "minus" else: if strand == "plus": out_p.write(line + "\n") elif strand == "minus": out_m.write(line + "\n") out_p.close() out_m.close() return tmp_plus, tmp_minus def _blast(self, database, 
               database_format, data_type, args_srna, prefixs, program,
               database_type, e):
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                    args_srna.out_folder, "blast_result_and_misc",
                    "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0} in {1}".format(prefix,
                                                           database))
                seq_file = os.path.join(args_srna.out_folder,
                                        "_".join(["sRNA_seq", prefix]))
                # compare base names: os.listdir() returns file names, so
                # the original full-path membership test could never match
                if "_".join(["sRNA_seq", prefix]) not in os.listdir(
                        args_srna.out_folder):
                    self.helper.get_seq(
                        srna_file,
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        seq_file)
                if database_type == "nr":
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                        seq_file, args_srna.out_folder)
                    tmp_blast = "tmp_blast.txt"
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_plus, tmp_blast,
                                    "plus")
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_minus, blast_file,
                                    "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, seq_file, blast_file,
                                    "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''classify the sRNAs based on the filters'''
        if (args_srna.import_info is not None) or (
                args_srna.srna_database is not None) or (
                args_srna.nr_database is not None) or (
                self.sorf_path is not None) or (
                self.tss_path is not None) or (
                self.term_path is not None) or (
                args_srna.promoter_table is not None):
            for prefix in prefixs:
                print("Classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                # create each per-genome output folder once
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                classify_srna(
                    os.path.join(self.all_best["all_gff"],
                                 "_".join([prefix, "sRNA.gff"])),
                    class_gff, out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table, self.term_path)

    def _get_best_result(self, prefixs, args_srna):
        '''get the best results based on the filters'''
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                best_gff, args_srna)
            gen_srna_table(
                os.path.join(self.all_best["best_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]),
                args_srna, best_table, self.term_path)

    def _remove_file(self,
args_srna): self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir") self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file") self.helper.remove_tmp(args_srna.fastas) self.helper.remove_tmp(args_srna.gffs) self.helper.remove_tmp(self.gff_output) if args_srna.frag_wigs is not None: self.helper.remove_tmp(args_srna.frag_wigs) if args_srna.tex_wigs is not None: self.helper.remove_tmp(args_srna.tex_wigs) if (args_srna.frag_wigs is not None) and (args_srna.tex_wigs is not None): shutil.rmtree(args_srna.merge_wigs) self.helper.remove_tmp(args_srna.trans) if args_srna.tss_folder is not None: self.helper.remove_tmp(args_srna.tss_folder) if args_srna.pro_folder is not None: self.helper.remove_tmp(args_srna.pro_folder) if args_srna.sorf_file is not None: self.helper.remove_tmp(args_srna.sorf_file) if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) if self.term_path is not None: self.helper.remove_tmp(args_srna.terms) def _filter_srna(self, args_srna, prefixs): '''set the filter of sRNA''' if args_srna.import_info is not None: if "sec_str" in args_srna.import_info: self._compute_2d_and_energy(args_srna, prefixs) if args_srna.nr_database is not None: self._blast(args_srna.nr_database, args_srna.nr_format, "prot", args_srna, prefixs, "blastx", "nr", args_srna.e_nr) if self.sorf_path is not None: for prefix in prefixs: if ("_".join([prefix, "sORF.gff"]) in os.listdir(self.sorf_path)): tmp_srna = os.path.join(args_srna.out_folder, "".join(["tmp_srna_sorf", prefix])) tmp_sorf = os.path.join(args_srna.out_folder, "".join(["tmp_sorf_srna", prefix])) srna_sorf_comparison( "_".join([self.prefixs["basic"], prefix]), os.path.join(self.sorf_path, "_".join([prefix, "sORF.gff"])), tmp_srna, tmp_sorf) os.remove(tmp_sorf) shutil.move(tmp_srna, "_".join([self.prefixs["basic"], prefix])) if args_srna.srna_database is not None: self._blast(args_srna.srna_database, args_srna.srna_format, "nucl", args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna) def _import_info_format(self, import_info): new_info = [] for info in import_info: info = info.lower() new_info.append(info) return new_info def _gen_table(self, prefixs, args_srna): for prefix in prefixs: out_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) gen_srna_table( os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), "_".join([self.prefixs["merge_table"], prefix]), "_".join([self.tmps["nr"], prefix + ".csv"]), "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna, out_table, self.term_path) def _print_rank_all(self, prefixs): for prefix in prefixs: all_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) print_rank_all(all_table, best_table) def _filter_min_utr(self, prefixs, min_utr): '''filter out the low expressed UTR-derived sRNA''' for prefix in prefixs: filter_utr( os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])), min_utr) def _antisense(self, gffs, prefixs): '''detection of antisense''' for prefix in prefixs: all_table = os.path.join(self.all_best["all_table"], "_".join([prefix, "sRNA.csv"])) best_table = os.path.join(self.all_best["best_table"], "_".join([prefix, "sRNA.csv"])) all_gff = os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])) best_gff = os.path.join(self.all_best["best_gff"], 
"_".join([prefix, "sRNA.gff"])) srna_antisense(all_gff, all_table, os.path.join(gffs, prefix + ".gff")) srna_antisense(best_gff, best_table, os.path.join(gffs, prefix + ".gff")) def _blast_stat(self, stat_path, srna_tables): '''do statistics for blast result''' for srna_table in os.listdir(os.path.join(srna_tables, "best")): out_srna_blast = os.path.join( stat_path, "stat_" + srna_table.replace(".csv", "_blast.csv")) blast_class(os.path.join(srna_tables, "best", srna_table), out_srna_blast) def _compare_term_promoter(self, out_table, prefix, args_srna): '''compare sRNA with terminator and promoter''' if self.term_path is not None: compare_srna_term( os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), out_table, os.path.join(self.term_path, "_".join([prefix, "term.gff"])), args_srna.fuzzy_b, args_srna.fuzzy_a) if (args_srna.promoter_table is not None): compare_srna_promoter( os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"])), out_table, args_srna) def run_srna_detection(self, args_srna): self._check_necessary_file(args_srna) self.multiparser.parser_gff(args_srna.trans, "transcript") self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None, "transcript") if args_srna.import_info is not None: args_srna.import_info = self._import_info_format( args_srna.import_info) prefixs = self._run_program(args_srna) self._filter_srna(args_srna, prefixs) for prefix in prefixs: shutil.copyfile( "_".join([self.prefixs["basic"], prefix]), os.path.join(self.all_best["all_gff"], "_".join([prefix, "sRNA.gff"]))) self._compare_term_promoter( "_".join([self.prefixs["merge_table"], prefix]), prefix, args_srna) self._gen_table(prefixs, args_srna) self._class_srna(prefixs, args_srna) self._get_best_result(prefixs, args_srna) self._print_rank_all(prefixs) if args_srna.srna_database is not None: if "blast_srna" in args_srna.import_info: self._blast_stat(self.stat_path, self.table_output) self._remove_file(args_srna)
class sRNATargetPrediction(object): '''detection of sRNA-target interaction''' def __init__(self, args_tar): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_parser = Gff3Parser() self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs") self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs") self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results") self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results") self.merge_path = os.path.join(args_tar.out_folder, "merged_results") self.srna_path = os.path.join(args_tar.srnas, "tmp") self.fasta_path = os.path.join(args_tar.fastas, "tmp") self.gff_path = os.path.join(args_tar.gffs, "tmp") self.tmps = { "tmp": "tmp_srna_target", "rnaup": "tmp_rnaup", "log": "tmp_log", "all_fa": "tmp*.fa", "all_txt": "tmp*.txt" } def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span, unstr_region, seq_path, prefix, out_path): current = os.getcwd() os.chdir(out_path) command = " ".join([ rnaplfold_path, "-W", str(win_size), "-L", str(span), "-u", str(unstr_region), "-O" ]) if file_type == "sRNA": os.system("<".join([ command, os.path.join( current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"])) ])) else: os.system("<".join([ command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"])) ])) os.chdir(current) def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _sort_srna_fasta(self, fasta, prefix, path): out = open( os.path.join(path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w") srnas = [] with open(fasta) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): name = line[1:] else: srnas.append({"name": name, "seq": line, "len": len(line)}) srnas = sorted(srnas, key=lambda x: (x["len"])) for srna in srnas: out.write(">" + srna["name"].split("|")[0] + "\n") out.write(srna["seq"] + "\n") out.close() def _read_fasta(self, fasta_file): seq = "" with open(fasta_file, "r") as seq_f: for line in seq_f: line = line.strip() if line.startswith(">"): continue else: seq = seq + line return seq def _get_specific_seq(self, srna_file, seq_file, srna_out, querys): for query in querys: srna_datas = query.split(":") srna = { "seq_id": srna_datas[0], "strand": srna_datas[3], "start": int(srna_datas[1]), "end": int(srna_datas[2]) } gff_f = open(srna_file, "r") out = open(srna_out, "a") seq = self._read_fasta(seq_file) num = 0 detect = False for entry in self.gff_parser.entries(gff_f): if (entry.seq_id == srna["seq_id"]) and ( entry.strand == srna["strand"]) and ( entry.start == srna["start"]) and (entry.end == srna["end"]): detect = True if "ID" in entry.attributes.keys(): id_ = entry.attributes["ID"] else: id_ = entry.feature + str(num) gene = self.helper.extract_gene(seq, entry.start, entry.end, entry.strand) out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format( id_, entry.seq_id, entry.start, entry.end, entry.strand, gene)) num += 1 if not detect: print("Error: Some of the query sRNAs do not exist!") sys.exit() gff_f.close() out.close() def _gen_seq(self, prefixs, args_tar): print("Generating sRNA fasta files") for srna in os.listdir(self.srna_path): if srna.endswith("_sRNA.gff"): prefix = srna.replace("_sRNA.gff", "") 
prefixs.append(prefix) srna_out = os.path.join(self.srna_seq_path, "_".join([prefix, "sRNA.fa"])) if "all" in args_tar.query: self.helper.get_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out) else: if "_".join([prefix, "sRNA.fa"]) in os.listdir(self.srna_seq_path): os.remove(srna_out) self._get_specific_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out, args_tar.query) self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path) print("Generating target fasta files") for gff in os.listdir(self.gff_path): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") potential_target(os.path.join(self.gff_path, gff), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.target_seq_path), args_tar) file_num = 1 num = 0 sub_prefix = os.path.join(self.target_seq_path, "_".join([prefix, "target"])) sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w") with open((sub_prefix + ".fa"), "r") as t_f: for line in t_f: line = line.strip() if line.startswith(">"): num += 1 if (num == 100): num = 0 file_num += 1 sub_out.close() sub_out = open( "_".join([sub_prefix, str(file_num) + ".fa"]), "w") sub_out.write(line + "\n") sub_out.close() def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar): print("Running RNAplex of {0}".format(prefix)) num_process = 0 processes = [] for seq in os.listdir(self.target_seq_path): if (prefix in seq) and ("_target_" in seq): print("Running RNAplex with {0}".format(seq)) out_rnaplex = open( os.path.join( self.rnaplex_path, prefix, "_".join( [prefix, "RNAplex", str(num_process) + ".txt"])), "w") num_process += 1 p = Popen([ args_tar.rnaplex_path, "-q", os.path.join( self.srna_seq_path, "_".join( [self.tmps["tmp"], prefix, "sRNA.fa"])), "-t", os.path.join(self.target_seq_path, seq), "-l", str(args_tar.inter_length), "-e", str(args_tar.energy), "-z", str(args_tar.duplex_dist), "-a", rnaplfold_folder ], stdout=out_rnaplex) processes.append(p) if num_process % args_tar.core_plex == 0: self._wait_process(processes) self._wait_process(processes) return num_process def _rna_plex(self, prefixs, args_tar): for prefix in prefixs: print("Running RNAplfold of {0}".format(prefix)) self.helper.check_make_folder( os.path.join(self.rnaplex_path, prefix)) rnaplfold_folder = os.path.join(self.rnaplex_path, prefix, "RNAplfold") os.mkdir(rnaplfold_folder) self._run_rnaplfold(args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s, args_tar.span_s, args_tar.unstr_region_rnaplex_s, self.srna_seq_path, prefix, rnaplfold_folder) self._run_rnaplfold(args_tar.rnaplfold_path, "target", args_tar.win_size_t, args_tar.span_t, args_tar.unstr_region_rnaplex_t, self.target_seq_path, prefix, rnaplfold_folder) num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar) rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) if ("_".join([prefix, "RNAplex.txt"]) in os.listdir(os.path.join(self.rnaplex_path, prefix))): os.remove(rnaplex_file) for index in range(0, num_process): self.helper.merge_file( os.path.join( self.rnaplex_path, prefix, "_".join([prefix, "RNAplex", str(index) + ".txt"])), rnaplex_file) self.helper.remove_all_content( os.path.join(self.rnaplex_path, prefix), "_RNAplex_", "file") self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"]) shutil.move(self.tmps["tmp"], rnaplex_file) shutil.rmtree(rnaplfold_folder) def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar): for index in range(1, num_up + 1): out_tmp_up = open( 
os.path.join(args_tar.out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), "w") out_err = open( os.path.join(args_tar.out_folder, "".join([self.tmps["log"], str(index), ".txt"])), "w") in_up = open( os.path.join(args_tar.out_folder, "".join([self.tmps["tmp"], str(index), ".fa"])), "r") p = Popen([ args_tar.rnaup_path, "-u", str(args_tar.unstr_region_rnaup), "-o", "--interaction_first" ], stdin=in_up, stdout=out_tmp_up, stderr=out_err) processes.append(p) if len(processes) != 0: time.sleep(5) self._wait_process(processes) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_fa"])) self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_txt"])) def _merge_txt(self, num_up, out_rnaup, out_log, out_folder): for index in range(1, num_up + 1): self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), out_rnaup) self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["log"], str(index), ".txt"])), out_log) def _get_continue(self, out_rnaup): '''For RNAup, it can continue running RNAup based on previous run''' srnas = [] matchs = {} out = open("tmp.txt", "w") with open(out_rnaup) as f_h: for line in f_h: line = line.strip() if ">srna" in line: srna = line[1:] srnas.append(srna) matchs[srna] = [] else: matchs[srna].append(line) srnas = srnas[:-1] for srna in srnas: out.write(">" + srna + "\n") for target in matchs[srna]: out.write(target + "\n") out.close() os.remove(out_rnaup) shutil.move("tmp.txt", out_rnaup) return srnas def _rnaup(self, prefixs, args_tar): for prefix in prefixs: srnas = [] print("Running RNAup of {0}".format(prefix)) if not os.path.exists(os.path.join(self.rnaup_path, prefix)): os.mkdir(os.path.join(self.rnaup_path, prefix)) num_up = 0 processes = [] out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.txt"])) out_log = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.log"])) if "_".join([prefix, "RNAup.txt"]) in \ os.listdir(os.path.join(self.rnaup_path, prefix)): if not args_tar.continue_rnaup: os.remove(out_rnaup) os.remove(out_log) else: srnas = self._get_continue(out_rnaup) with open( os.path.join( self.srna_seq_path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f: for line in s_f: line = line.strip() if line.startswith(">"): if line[1:] in srnas: start = False continue start = True print("Running RNAup with {0}".format(line[1:])) num_up += 1 out_up = open( os.path.join( args_tar.out_folder, "".join([self.tmps["tmp"], str(num_up), ".fa"])), "w") out_up.write(line + "\n") else: if start: out_up.write(line + "\n") out_up.close() self.helper.merge_file( os.path.join(self.target_seq_path, "_".join([prefix, "target.fa"])), os.path.join( args_tar.out_folder, "".join( [self.tmps["tmp"], str(num_up), ".fa"]))) if num_up == args_tar.core_up: self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar) processes = [] num_up = 0 self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar) def _merge_rnaplex_rnaup(self, prefixs, args_tar): '''merge the result of RNAup and RNAplex''' for prefix in prefixs: rnaplex_file = None rnaup_file = None out_rnaplex = None out_rnaup = None self.helper.check_make_folder(os.path.join(self.merge_path, prefix)) print("Ranking {0} now".format(prefix)) if (args_tar.program == "both") or (args_tar.program == "RNAplex"): rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) out_rnaplex = os.path.join( 
self.rnaplex_path, prefix, "_".join([prefix, "RNAplex_rank.csv"])) if (args_tar.program == "both") or (args_tar.program == "RNAup"): rnaup_file = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup.txt"])) out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup_rank.csv"])) merge_srna_target( rnaplex_file, rnaup_file, args_tar, out_rnaplex, out_rnaup, os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.merge_path, prefix, "_".join([prefix, "merge.csv"])), os.path.join(self.merge_path, prefix, "_".join([prefix, "overlap.csv"])), os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])), os.path.join(self.gff_path, prefix + ".gff")) def run_srna_target_prediction(self, args_tar): self._check_gff(args_tar.gffs) self._check_gff(args_tar.srnas) self.multiparser.parser_gff(args_tar.gffs, None) self.multiparser.parser_fasta(args_tar.fastas) self.multiparser.parser_gff(args_tar.srnas, "sRNA") prefixs = [] self._gen_seq(prefixs, args_tar) if (args_tar.program == "both") or (args_tar.program == "RNAplex"): self._rna_plex(prefixs, args_tar) self.helper.remove_all_content(self.target_seq_path, "_target_", "file") # if (args_tar.program == "RNAplex") or ( # args_tar.program == "both"): # for strain in os.listdir(os.path.join( # args_tar.out_folder, "RNAplex_results")): # shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex_results", # strain, "RNAplfold")) if (args_tar.program == "both") or (args_tar.program == "RNAup"): self._rnaup(prefixs, args_tar) self._merge_rnaplex_rnaup(prefixs, args_tar) self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "dir") self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "file") self.helper.remove_tmp_dir(args_tar.gffs) self.helper.remove_tmp_dir(args_tar.srnas) self.helper.remove_tmp_dir(args_tar.fastas) self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
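# Query format note (inferred from _get_specific_seq above): each entry in
# args_tar.query is split on ":" into seq_id, start, end and strand, so a
# query looks like "NC_000915.1:1000:1100:+" (the accession here is only
# an illustration). Passing the single value "all" instead makes _gen_seq
# extract every annotated sRNA of each genome.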
class MEME(object): '''detection of promoter''' def __init__(self, args_pro): self.multiparser = Multiparser() self.helper = Helper() self.tss_path = os.path.join(args_pro.tsss, "tmp") if args_pro.gffs is not None: self.gff_path = os.path.join(args_pro.gffs, "tmp") else: self.gff_path = None self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes") self.tmp_folder = os.path.join(os.getcwd(), "tmp") self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"), "sec": os.path.join(self.tmp_folder, "secondary.fa"), "inter": os.path.join(self.tmp_folder, "internal.fa"), "anti": os.path.join(self.tmp_folder, "antisense.fa"), "orph": os.path.join(self.tmp_folder, "orphan.fa"), "all_no_orph": "without_orphan.fa", "all": "all_type.fa", "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"), "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")} self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa") self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff") def _gen_and_check_folder(self, out_path, folder, type_): sub_out_folder = os.path.join(out_path, type_) if folder in os.listdir(sub_out_folder): shutil.rmtree(os.path.join(sub_out_folder, folder)) return sub_out_folder def _run_normal_motif(self, input_path, out_path, filename, fasta, width, args_pro, log): '''run MEME with specific width''' folder = "_".join(["promoter_motifs", filename, str(width), "nt"]) if (args_pro.program.lower() == "meme") or ( args_pro.program.lower() == "both"): meme_folder = self._gen_and_check_folder( out_path, folder, "MEME") command = [args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs", str(args_pro.num_motif), "-w", str(width), "-maxiter", "100", "-evt", str(args_pro.e_value)] if args_pro.para is not None: command = command + ["-p", args_pro.para] log.write(" ".join(command + ["-oc", os.path.join( meme_folder, folder), os.path.join(input_path, fasta)]) + "\n") call(command + ["-oc", os.path.join(meme_folder, folder), os.path.join(input_path, fasta)]) if (args_pro.program.lower() == "glam2") or ( args_pro.program.lower() == "both"): glam_folder = self._gen_and_check_folder( out_path, folder, "GLAM2") log.write(" ".join([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-w", str(width), "-b", str(width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) + "\n") call([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-w", str(width), "-b", str(width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) def _run_small_motif(self, input_path, out_path, filename, fasta, width, args_pro, log): '''run MEME with range of width''' data = width.split("-") min_width = data[0] max_width = data[1] folder = "_".join(["promoter_motifs", filename, "-".join([str(min_width), str(max_width)]), "nt"]) if (args_pro.program.lower() == "meme") or ( args_pro.program.lower() == "both"): meme_folder = self._gen_and_check_folder( out_path, folder, "MEME") command = [args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs", str(args_pro.num_motif), "-minsites", "0", "-maxsites", "2", "-minw", str(min_width), "-maxw", str(max_width), "-maxiter", "100", "-evt", str(args_pro.e_value)] if args_pro.para is not None: command = command + ["-p", args_pro.para] log.write(" ".join(command + ["-oc", os.path.join( meme_folder, folder), os.path.join(input_path, fasta)]) + "\n") call(command + ["-oc", os.path.join(meme_folder, folder), os.path.join(input_path, fasta)]) if 
(args_pro.program.lower() == "glam2") or ( args_pro.program.lower() == "both"): glam_folder = self._gen_and_check_folder( out_path, folder, "GLAM2") log.write(" ".join([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-a", str(min_width), "-b", str(max_width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) + "\n") call([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-a", str(min_width), "-b", str(max_width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) def _get_fasta_file(self, fasta_path, prefix): for fasta in os.listdir(fasta_path): if (fasta.endswith(".fa")) and \ (prefix == fasta.replace(".fa", "")): break elif (fasta.endswith(".fna")) and \ (prefix == fasta.replace(".fna", "")): break elif (fasta.endswith(".fasta")) and \ (prefix == fasta.replace(".fasta", "")): break return fasta def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _move_and_merge_fasta(self, input_path, prefix): all_type = os.path.join(self.tmp_folder, self.fastas["all"]) all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"]) if self.fastas["all"] in os.listdir(self.tmp_folder): os.remove(all_type) if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder): os.remove(all_no_orph) shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"]) shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"]) self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"]) del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph) del_repeat_fasta(self.fastas["tmp_all"], all_type) os.remove(self.fastas["tmp_fa"]) os.remove(self.fastas["tmp_all"]) out_prefix = os.path.join(input_path, prefix) shutil.move(self.fastas["pri"], "_".join([ out_prefix, "allgenome_primary.fa"])) shutil.move(self.fastas["sec"], "_".join([ out_prefix, "allgenome_secondary.fa"])) shutil.move(self.fastas["inter"], "_".join([ out_prefix, "allgenome_internal.fa"])) shutil.move(self.fastas["anti"], "_".join([ out_prefix, "allgenome_antisense.fa"])) shutil.move(self.fastas["orph"], "_".join([ out_prefix, "allgenome_orphan.fa"])) shutil.move(all_type, "_".join([ out_prefix, "allgenome_all_types.fa"])) shutil.move(all_no_orph, "_".join([ out_prefix, "allgenome_without_orphan.fa"])) def _split_fasta_by_strain(self, input_path): for fasta in os.listdir(input_path): if "allgenome" not in fasta: os.remove(os.path.join(input_path, fasta)) out = None for fasta in os.listdir(input_path): if fasta.endswith(".fa"): pre_strain = "" num_strain = 0 with open(os.path.join(input_path, fasta), "r") as f_h: for line in f_h: line = line.strip() if line.startswith(">"): datas = line.split("_") strain = "_".join(datas[2:]) if (pre_strain != strain): num_strain += 1 filename = fasta.split("allgenome") if out is not None: out.close() out = open(os.path.join( input_path, "".join([ filename[0], strain, filename[-1]])), "a") pre_strain = strain out.write(line + "\n") else: out.write(line + "\n") if num_strain == 1: os.remove(os.path.join(input_path, "".join([filename[0], strain, filename[-1]]))) out.close() def _run_program(self, prefixs, args_pro, log, input_fastas): log.write("Using MEME or GLAM2 to predict promoter.\n") log.write("Please make 
sure their versions are at least 4.11.1.\n") log.write("If you are running in parallel, please make sure you " "have installed MPICH and its version is at least 3.2.\n") for prefix in prefixs: input_path = os.path.join(self.out_fasta, prefix) out_path = os.path.join(args_pro.output_folder, prefix) if args_pro.program.lower() == "both": self.helper.check_make_folder(os.path.join(out_path, "MEME")) self.helper.check_make_folder(os.path.join(out_path, "GLAM2")) elif args_pro.program.lower() == "meme": self.helper.check_make_folder(os.path.join(out_path, "MEME")) elif args_pro.program.lower() == "glam2": self.helper.check_make_folder(os.path.join(out_path, "GLAM2")) for fasta in os.listdir(input_path): filename = fasta.replace(".fa", "") names = filename.split("_") if (names[-1] in input_fastas) or ( ("_".join(names[-2:]) == "all_types") and ( "all_types" in input_fastas)) or ( ("_".join(names[-2:]) == "without_orphan") and ( "without_orphan" in input_fastas)): for width in args_pro.widths: print("Computing promoters of {0} - {1}".format( fasta, width)) log.write("Computing promoters of {0} - length {1}.\n".format( fasta, width)) if "-" in width: self._run_small_motif(input_path, out_path, filename, fasta, width, args_pro, log) else: self._run_normal_motif(input_path, out_path, filename, fasta, width, args_pro, log) log.write("Promoter search for {0} is done.\n".format(prefix)) log.write("All the output files from MEME or GLAM2 are generated " "and stored in {0}.\n".format(out_path)) def _combine_file(self, prefixs, args_pro): '''combine all TSS files in the input folder to generate the global TSS file for detecting global promoters''' if args_pro.source: for tss in os.listdir(self.tss_path): if tss.endswith("_TSS.gff"): self.helper.merge_file(os.path.join( self.tss_path, tss), self.all_tss) for fasta in os.listdir(args_pro.fastas): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): self.helper.merge_file(os.path.join( args_pro.fastas, fasta), self.all_fasta) else: for tss in os.listdir(os.path.join( args_pro.output_folder, "TSS_classes")): if tss.endswith("_TSS.gff"): self.helper.merge_file(os.path.join( self.tss_path, tss), self.all_tss) for fasta in os.listdir(args_pro.fastas): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): self.helper.merge_file(os.path.join( args_pro.fastas, fasta), self.all_fasta) print("Generating fasta file of all sequences") prefixs.append("allfasta") input_path = os.path.join(self.out_fasta, "allfasta") self.helper.check_make_folder(os.path.join( args_pro.output_folder, "allfasta")) self.helper.check_make_folder(os.path.join( self.out_fasta, "allfasta")) args_pro.source = True upstream(self.all_tss, self.all_fasta, None, None, args_pro, None) self._move_and_merge_fasta(input_path, "allfasta") def _remove_files(self, args_pro): self.helper.remove_tmp_dir(args_pro.fastas) self.helper.remove_tmp_dir(args_pro.tsss) self.helper.remove_tmp_dir(args_pro.gffs) if "tmp_wig" in os.listdir(args_pro.output_folder): shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig")) if "allfasta" in os.listdir(os.getcwd()): shutil.rmtree("allfasta") if "tmp" in os.listdir(os.getcwd()): shutil.rmtree("tmp") def _gen_table(self, output_folder, prefixs, combine, program, log): '''generate the promoter table''' log.write("Running gen_promoter_table.py to generate the promoter " "table, which is useful for sRNA prediction.\n") log.write("The following files are generated:\n") if combine: strains = prefixs + ["allfasta"] 
else: strains = prefixs for strain in strains: tss_file = os.path.join(self.tss_path, strain + "_TSS.gff") if (program.lower() == "both") or ( program.lower() == "meme"): for folder in os.listdir(os.path.join(output_folder, strain, "MEME")): csv_file = os.path.join(output_folder, strain, "MEME", folder, "meme.csv") gen_promoter_table(os.path.join(output_folder, strain, "MEME", folder, "meme.txt"), csv_file, tss_file, "meme") log.write("\t" + csv_file + "\n") if (program.lower() == "both") or ( program.lower() == "glam2"): for folder in os.listdir(os.path.join(output_folder, strain, "GLAM2")): csv_file = os.path.join(output_folder, strain, "GLAM2", folder, "glam2.csv") gen_promoter_table(os.path.join(output_folder, strain, "GLAM2", folder, "glam2.txt"), csv_file, tss_file, "glam2") log.write("\t" + csv_file + "\n") def _get_upstream(self, args_pro, prefix, tss, fasta): '''get upstream sequence of TSS''' if args_pro.source: print("Generating fasta file of {0}".format(prefix)) upstream(os.path.join(self.tss_path, tss), os.path.join(args_pro.fastas, fasta), None, None, args_pro, prefix) else: if (args_pro.gffs is None): print("Error: Please assign proper annotation files!") sys.exit() if "TSS_classes" not in os.listdir(args_pro.output_folder): os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes")) print("Classifying TSSs and extracting sequences of {0}".format(prefix)) upstream(os.path.join(self.tss_path, tss), os.path.join(args_pro.fastas, fasta), os.path.join(self.gff_path, prefix + ".gff"), os.path.join(args_pro.output_folder, "TSS_classes", "_".join([prefix, "TSS.gff"])), args_pro, prefix) def _get_used_tss_type(self, args_pro): input_fastas = [] for tss in args_pro.use_tss: if int(tss) == 1: input_fastas.append("all_types") elif int(tss) == 2: input_fastas.append("primary") elif int(tss) == 3: input_fastas.append("secondary") elif int(tss) == 4: input_fastas.append("internal") elif int(tss) == 5: input_fastas.append("antisense") elif int(tss) == 6: input_fastas.append("orphan") elif int(tss) == 7: input_fastas.append("without_orphan") else: print("Error: The assignment of --use_tss_type is wrong!") sys.exit() return input_fastas def run_meme(self, args_pro, log): if "allfasta.fa" in os.listdir(args_pro.fastas): os.remove(self.all_fasta) if "allfasta.fa_folder" in os.listdir(args_pro.fastas): shutil.rmtree(os.path.join(args_pro.fastas, "allfasta.fa_folder")) self.multiparser.parser_fasta(args_pro.fastas) self.multiparser.parser_gff(args_pro.tsss, "TSS") if "allfasta_TSS.gff" in os.listdir(self.tss_path): os.remove(self.all_tss) if args_pro.gffs is not None: self._check_gff(args_pro.gffs) self.multiparser.parser_gff(args_pro.gffs, None) self.multiparser.combine_gff(args_pro.fastas, self.gff_path, "fasta", None) self._check_gff(args_pro.tsss) self.multiparser.combine_gff(args_pro.fastas, self.tss_path, "fasta", "TSS") self.helper.check_make_folder(self.out_fasta) self.helper.check_make_folder(self.tmp_folder) prefixs = [] log.write("Running TSS_upstream.py to extract the upstream " "sequences of TSSs.\n") log.write("The following files are generated:\n") for tss in os.listdir(self.tss_path): prefix = tss.replace("_TSS.gff", "") prefixs.append(prefix) self.helper.check_make_folder(os.path.join(args_pro.output_folder, prefix)) self.helper.check_make_folder(os.path.join(self.out_fasta, prefix)) input_path = os.path.join(self.out_fasta, prefix) fasta = self._get_fasta_file(args_pro.fastas, prefix) self._get_upstream(args_pro, prefix, tss, fasta) self._move_and_merge_fasta(input_path, prefix) 
self._split_fasta_by_strain(input_path) for file_ in os.listdir(input_path): log.write("\t" + os.path.join(input_path, file_) + "\n") if args_pro.combine: self._combine_file(prefixs, args_pro) for file_ in os.listdir(os.path.join(self.out_fasta, "allfasta")): log.write("\t" + os.path.join( self.out_fasta, "allfasta", file_) + "\n") input_fastas = self._get_used_tss_type(args_pro) self._run_program(prefixs, args_pro, log, input_fastas) print("Generating the tables") self._gen_table(args_pro.output_folder, prefixs, args_pro.combine, args_pro.program, log) self._remove_files(args_pro)
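# Usage sketch (illustrative values only, not part of the original module):
# run_meme reads an argparse-style namespace with the attributes referenced
# above. Note that "widths" may mix a fixed width ("45") with a range
# ("2-10"); a range dispatches to _run_small_motif, a single value to
# _run_normal_motif.
#
#     from types import SimpleNamespace
#     args_pro = SimpleNamespace(
#         fastas="input/fastas", tsss="output/TSSs/gffs", gffs="input/gffs",
#         output_folder="output/promoters", program="meme",
#         meme_path="meme", glam2_path="glam2", para=None,
#         num_motif=10, e_value=0.05, end_run=100,
#         widths=["45", "2-10"], use_tss=["1"], combine=False, source=True)
#     with open("promoter.log", "w") as log:
#         MEME(args_pro).run_meme(args_pro, log)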
class OperonDetection(object): def __init__(self, args_op): self.multiparser = Multiparser() self.helper = Helper() self.tss_path = os.path.join(args_op.tsss, "tmp") self.tran_path = os.path.join(args_op.trans, "tmp") self.utr5_path = os.path.join(args_op.utr5s, "tmp") self.utr3_path = os.path.join(args_op.utr3s, "tmp") self.table_path = os.path.join(args_op.output_folder, "tables") if args_op.terms is not None: self._check_gff(args_op.terms, "term") self.term_path = os.path.join(args_op.terms, "tmp") else: self.term_path = None def _check_gff(self, gffs, type_): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _detect_operon(self, prefixs, args_op): for prefix in prefixs: out_table = os.path.join(self.table_path, "_".join(["operon", prefix + ".csv"])) print("Detecting operons of {0}".format(prefix)) tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) tran = self.helper.get_correct_file( self.tran_path, "_transcript.gff", prefix, None, None) gff = self.helper.get_correct_file( args_op.gffs, ".gff", prefix, None, None) if self.term_path is None: term = False else: term = self.helper.get_correct_file( self.term_path, "_term.gff", prefix, None, None) operon(tran, tss, gff, term, args_op.tss_fuzzy, args_op.term_fuzzy, args_op.length, out_table) def _check_and_parser_gff(self, args_op): self._check_gff(args_op.tsss, "tss") self._check_gff(args_op.gffs, "gff") self._check_gff(args_op.trans, "tran") self._check_gff(args_op.utr5s, "utr") self._check_gff(args_op.utr3s, "utr") self.multiparser.parser_gff(args_op.gffs, None) self.multiparser.parser_gff(args_op.tsss, "TSS") self.multiparser.combine_gff(args_op.gffs, self.tss_path, None, "TSS") self.multiparser.parser_gff(args_op.trans, "transcript") self.multiparser.combine_gff(args_op.gffs, self.tran_path, None, "transcript") self.multiparser.parser_gff(args_op.utr5s, "5UTR") self.multiparser.combine_gff(args_op.gffs, self.utr5_path, None, "5UTR") self.multiparser.parser_gff(args_op.utr3s, "3UTR") self.multiparser.combine_gff(args_op.gffs, self.utr3_path, None, "3UTR") if args_op.terms is not None: self._check_gff(args_op.terms, "term") self.multiparser.parser_gff(args_op.terms, "term") self.multiparser.combine_gff(args_op.gffs, self.term_path, None, "term") def _stat(self, table_path, stat_folder): for table in os.listdir(table_path): if table.startswith("operon_") and table.endswith(".csv"): filename = "_".join(["stat", table]) out_stat = os.path.join(stat_folder, filename) stat(os.path.join(table_path, table), out_stat) def _combine_gff(self, prefixs, args_op): for prefix in prefixs: out_file = os.path.join(args_op.output_folder, "gffs", "_".join([prefix, "all_features.gff"])) print("Combining all features of {0}".format(prefix)) tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) tran = self.helper.get_correct_file( self.tran_path, "_transcript.gff", prefix, None, None) gff = self.helper.get_correct_file( args_op.gffs, ".gff", prefix, None, None) utr5 = self.helper.get_correct_file( self.utr5_path, "_5UTR.gff", prefix, None, None) utr3 = self.helper.get_correct_file( self.utr3_path, "_3UTR.gff", prefix, None, None) if self.term_path is None: term = None else: term = self.helper.get_correct_file( self.term_path, "_term.gff", prefix, None, None) combine_gff(gff, tran, tss, utr5, utr3, term, args_op.tss_fuzzy, args_op.term_fuzzy, out_file) def run_operon(self, args_op): self._check_and_parser_gff(args_op) prefixs = [] for 
gff in os.listdir(args_op.gffs): if gff.endswith(".gff"): prefixs.append(gff.replace(".gff", "")) self._detect_operon(prefixs, args_op) if args_op.statistics: self._stat(self.table_path, args_op.stat_folder) if args_op.combine: self._combine_gff(prefixs, args_op) self.helper.remove_tmp(args_op.gffs) self.helper.remove_tmp(args_op.utr3s) self.helper.remove_tmp(args_op.utr5s) self.helper.remove_tmp(args_op.tsss) self.helper.remove_tmp(args_op.trans) if args_op.terms is not None: self.helper.remove_tmp(args_op.terms)
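# Usage sketch (paths and parameter values are placeholders): operon
# detection joins the TSS, transcript, UTR and (optionally) terminator GFF
# files, so terms may be None. tss_fuzzy/term_fuzzy are the fuzzy-distance
# parameters handed to operon() and combine_gff(); length is a length
# cutoff whose exact semantics live inside operon().
#
#     from types import SimpleNamespace
#     args_op = SimpleNamespace(
#         gffs="input/gffs", tsss="output/TSSs/gffs",
#         trans="output/transcripts/gffs", utr5s="output/UTRs/5UTRs",
#         utr3s="output/UTRs/3UTRs", terms=None,
#         output_folder="output/operons",
#         stat_folder="output/operons/statistics",
#         tss_fuzzy=5, term_fuzzy=30, length=20,
#         statistics=True, combine=True)
#     OperonDetection(args_op).run_operon(args_op)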
class Terminator(object): '''detection of terminator''' def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = { "term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables") } self.terms = { "all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "expressed_candidates"), "best": os.path.join(self.outfolder["term"], "best_candidates"), "non": os.path.join(self.outfolder["term"], "non_expressed_candidates") } self.csvs = { "all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "expressed_candidates"), "best": os.path.join(self.outfolder["csv"], "best_candidates"), "non": os.path.join(self.outfolder["csv"], "non_expressed_candidates") } self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = { "transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp") } self.suffixs = { "gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff" } if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: check_start = False fh = open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file(fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" if (not srna) and (fasta): self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" else: self.converter.convert_gff2rntptt(gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" return file_types, prefixs def _combine_ptt_rnt(self, gff_path, 
file_types, srna_path): self.helper.check_make_folder(self.combine_path) for prefix, file_type in file_types.items(): combine_file = os.path.join(self.combine_path, prefix + '.ptt') if file_type == "normal": files = [ os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt") ] self._combine_annotation(combine_file, files) elif file_type == "srna": files = [ os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt"), os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"])) ] self._combine_annotation(combine_file, files) def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term): call([ args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join( out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t" ])), "--bag-output", os.path.join(out_path, "_".join( [prefix, "best_terminator_after_gene.bag"])) ], stdout=out) def _run_TransTermHP(self, args_term): self.helper.check_make_folder(self.tmps["transterm"]) for file_ in os.listdir(self.combine_path): if ".ptt" in file_: prefix = file_.replace(".ptt", "") fasta = self.helper.get_correct_file(self.fasta_path, ".fa", prefix, None, None) if not fasta: print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() out_path = os.path.join(args_term.hp_folder, prefix) self.helper.check_make_folder(out_path) out = open( os.path.join(out_path, "_".join([prefix, "terminators.txt"])), "w") self._TransTermHP(fasta, file_, out_path, prefix, out, args_term) out.close() shutil.rmtree(self.combine_path) def _convert_to_gff(self, prefixs, args_term): for prefix in prefixs: for folder in os.listdir(args_term.hp_folder): if prefix == folder: out_path = os.path.join(args_term.hp_folder, folder) for file_ in os.listdir(out_path): if file_.endswith(".bag"): out_file = os.path.join( self.tmps["transterm"], "_".join([prefix, self.tmps["hp_gff"]])) self.converter.convert_transtermhp2gff( os.path.join(out_path, file_), out_file) self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"], None, self.tmps["hp"]) def _combine_wigs(self, args_term): if (args_term.tex_wigs is not None) and (args_term.frag_wigs is not None): folder = args_term.tex_wigs.split("/") folder = "/".join(folder[:-1]) merge_wigs = os.path.join(folder, "merge_wigs") self.helper.check_make_folder(merge_wigs) for wig in os.listdir(args_term.tex_wigs): if os.path.isdir(os.path.join(args_term.tex_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.tex_wigs, wig), merge_wigs) for wig in os.listdir(args_term.frag_wigs): if os.path.isdir(os.path.join(args_term.frag_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.frag_wigs, wig), merge_wigs) elif (args_term.tex_wigs is not None): merge_wigs = args_term.tex_wigs elif (args_term.frag_wigs is not None): merge_wigs = args_term.frag_wigs else: print("Error: Wiggle files are not assigned!") sys.exit() return merge_wigs def _merge_sRNA(self, sRNAs, prefixs, gff_path): '''searching the terminator with sRNA information''' if sRNAs is not None: self.multiparser.parser_gff(sRNAs, "sRNA") self.helper.check_make_folder(self.tmps["merge"]) for prefix in prefixs: tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"]) if self.tmps["gff"] in os.listdir(self.tmps["merge"]): os.remove(tmp_gff) self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"), tmp_gff) self.helper.merge_file( os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff) self.helper.sort_gff( tmp_gff, 
os.path.join(self.tmps["merge"], prefix + ".gff")) os.remove(tmp_gff) merge_path = self.tmps["merge"] else: merge_path = gff_path return merge_path def _move_file(self, term_outfolder, csv_outfolder): for gff in os.listdir(term_outfolder): if gff.endswith("_term.gff"): self.helper.sort_gff(os.path.join(term_outfolder, gff), self.tmps["gff"]) shutil.move(self.tmps["gff"], os.path.join(term_outfolder, gff)) prefix = gff.replace("_term.gff", "") new_gff = os.path.join( self.terms["all"], "_".join([prefix, self.suffixs["allgff"]])) csv_file = os.path.join( os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]]))) out = open(new_gff, "w") out.write("##gff-version 3\n") out.close() self.helper.merge_file( os.path.join(term_outfolder, gff), os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) os.remove(os.path.join(term_outfolder, gff)) pre_strain = "" if ("_".join([prefix, self.suffixs["csv"]]) in os.listdir(self.csvs["all"])): os.remove(csv_file) out_csv = open(csv_file, "w") out_csv.write("\t".join([ "Genome", "Name", "Start", "End", "Strand", "Detect", "Coverage_decrease", "Coverage_detail" ]) + "\n") out_csv.close() fh = open(new_gff) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: self.helper.merge_file( os.path.join( self.tmps["term_table"], "_".join([entry.seq_id, "term_raw.csv"])), os.path.join( self.csvs["all"], "_".join([prefix, self.suffixs["csv"]]))) pre_strain = entry.seq_id fh.close() def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix): print("Computing secondary structures of {0}".format(prefix)) self.helper.check_make_folder(self.tmps["folder"]) pre_cwd = os.getcwd() os.chdir(self.tmps["folder"]) os.system(" ".join([ RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec) ])) os.chdir(pre_cwd) shutil.rmtree(self.tmps["folder"]) def _compute_intersection_forward_reverse(self, prefixs, merge_path, wig_path, merge_wigs, args_term): '''the approach for searching terminators in gene-converged regions''' for prefix in prefixs: tmp_seq = os.path.join(args_term.out_folder, "_".join(["inter_seq", prefix])) tmp_index = os.path.join(args_term.out_folder, "_".join(["inter_index", prefix])) tmp_sec = os.path.join(args_term.out_folder, "_".join(["inter_sec", prefix])) tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) gff_file = os.path.join(merge_path, prefix + ".gff") tmp_cand = os.path.join( args_term.out_folder, "_".join(["term_candidates", prefix])) if os.path.exists(tran_file): print("Extracting sequences of {0}".format(prefix)) intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"), tran_file, gff_file, tmp_seq, tmp_index, args_term) self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix) extract_info_sec(tmp_sec, tmp_seq, tmp_index) os.remove(tmp_index) poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term) print("Detecting terminators for " + prefix) detect_coverage( tmp_cand, os.path.join(merge_path, prefix + ".gff"), os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(wig_path, "_".join([prefix, "forward.wig"])), os.path.join(wig_path, "_".join([prefix, "reverse.wig"])), os.path.join(self.tmps["hp_path"], "_".join([prefix, self.tmps["hp_gff"]])), merge_wigs, os.path.join(self.outfolder["term"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.tmps["term_table"], "_".join([prefix, "term_raw.csv"])), args_term) 
self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"], None, "term") self._move_file(self.outfolder["term"], self.outfolder["csv"]) def _remove_tmp_file(self, merge_wigs, args_term): self.helper.remove_tmp_dir(args_term.gffs) self.helper.remove_tmp_dir(args_term.fastas) if args_term.srnas is not None: self.helper.remove_tmp(args_term.srnas) shutil.rmtree(self.tmps["merge"]) if (args_term.tex_wigs is not None) and (args_term.frag_wigs is not None): shutil.rmtree(merge_wigs) self.helper.remove_tmp_dir(args_term.trans) if "tmp_wig" in os.listdir(args_term.out_folder): shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig")) self.helper.remove_tmp(self.outfolder["term"]) shutil.rmtree(self.tmps["transterm"]) shutil.rmtree(self.tmps["term_table"]) self.helper.remove_all_content(args_term.out_folder, "inter_seq_", "file") self.helper.remove_all_content(self.outfolder["term"], "_term.gff", "file") self.helper.remove_all_content(args_term.out_folder, "inter_sec_", "file") self.helper.remove_all_content(args_term.out_folder, "term_candidates_", "file") def _compute_stat(self, args_term): new_prefixs = [] for gff in os.listdir(self.terms["all"]): if gff.endswith("_term_all.gff"): out_tmp = open(self.tmps["gff"], "w") out_tmp.write("##gff-version 3\n") new_prefix = gff.replace("_term_all.gff", "") new_prefixs.append(gff.replace("_term_all.gff", "")) num = 0 fh = open(os.path.join(self.terms["all"], gff)) for entry in self.gff_parser.entries(fh): name = '%0*d' % (5, num) entry.attributes["ID"] = (entry.seq_id + "_terminator" + str(num)) entry.attributes["Name"] = "_".join(["terminator_" + name]) entry.attribute_string = ";".join([ "=".join(items) for items in entry.attributes.items() ]) out_tmp.write("\t".join([ entry.info_without_attributes, entry.attribute_string ]) + "\n") num += 1 out_tmp.close() fh.close() shutil.move( self.tmps["gff"], os.path.join(self.terms["all"], "_".join([new_prefix, self.suffixs["gff"]]))) stat_path = os.path.join(args_term.out_folder, "statistics") for prefix in new_prefixs: stat_term( os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["gff"]])), os.path.join(self.csvs["all"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(stat_path, "_".join(["stat", prefix + ".csv"])), os.path.join(self.terms["best"], "_".join([prefix, "term"])), os.path.join(self.terms["express"], "_".join([prefix, "term"])), os.path.join(self.terms["non"], "_".join([prefix, "term"]))) shutil.move( os.path.join(self.terms["best"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["best"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move( os.path.join(self.terms["express"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["express"], "_".join([prefix, self.suffixs["csv"]]))) shutil.move( os.path.join(self.terms["non"], "_".join([prefix, self.suffixs["csv"]])), os.path.join(self.csvs["non"], "_".join([prefix, self.suffixs["csv"]]))) os.remove( os.path.join(self.terms["all"], "_".join([prefix, self.suffixs["allgff"]]))) def _check_gff_file(self, folder): for file_ in os.listdir(folder): if file_.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(folder, file_)) def _compare_term_tran(self, args_term, prefixs): '''searching the associated terminator to transcript''' self.multiparser.combine_gff(args_term.gffs, self.tran_path, None, "transcript") prefixs = [] print("Comparing terminators with transcripts now") for file_ in os.listdir(self.tran_path): if file_.endswith("_transcript.gff"): 
prefixs.append(file_.replace("_transcript.gff", "")) for type_ in ("best_candidates", "expressed_candidates", "all_candidates"): compare_term_tran(self.tran_path, os.path.join(self.outfolder["term"], type_), args_term.fuzzy_up_ta, args_term.fuzzy_down_ta, args_term.out_folder, "terminator", self.outfolder["term"], args_term.trans) for prefix in prefixs: shutil.move( os.path.join( args_term.out_folder, "statistics", "stat_compare_transcript_terminator_" + prefix + ".csv"), os.path.join( args_term.out_folder, "statistics", "_".join([ "stat_compare_terminator_transcript", prefix, type_ + ".csv" ]))) def run_terminator(self, args_term): self._check_gff_file(args_term.gffs) self._check_gff_file(args_term.trans) self.multiparser.parser_fasta(args_term.fastas) if (not args_term.gffs) or (not args_term.fastas): print("Error: Please assign gff files " "and fasta files!") sys.exit() file_types, prefixs = self._convert_gff2rntptt(self.gff_path, self.fasta_path, args_term.srnas) self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path) self._run_TransTermHP(args_term) self._convert_to_gff(prefixs, args_term) self.helper.remove_tmp(self.gff_path) self.multiparser.parser_gff(args_term.trans, "transcript") self.helper.check_make_folder(self.tmps["term_table"]) self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"]) merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path) self._compute_intersection_forward_reverse(prefixs, merge_path, args_term.wig_path, args_term.merge_wigs, args_term) self._compute_stat(args_term) self._compare_term_tran(args_term, prefixs) self._remove_tmp_file(args_term.merge_wigs, args_term)
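# Usage sketch (all values are assumptions, not part of the original
# module): run_terminator combines TransTermHP on the converted ptt/rnt
# annotation with a coverage-based scan of gene-converged regions (RNAfold
# secondary structures plus poly-T detection), so it needs the external
# tool paths and the wiggle locations. Per _combine_wigs, tex_wigs and
# frag_wigs may each be None but not both; further attributes consumed
# inside poly_t/detect_coverage are omitted here.
#
#     from types import SimpleNamespace
#     args_term = SimpleNamespace(
#         gffs="input/gffs", fastas="input/fastas",
#         trans="output/transcripts/gffs", srnas=None,
#         out_folder="output/terminators",
#         hp_folder="output/terminators/transtermhp_results",
#         TransTermHP_path="transterm", expterm_path="expterm.dat",
#         RNAfold_path="RNAfold", tex_wigs="input/wigs/tex_notex",
#         frag_wigs=None, wig_path="input/wigs/tex_notex/tmp",
#         merge_wigs="input/wigs/tex_notex",
#         fuzzy_up_ta=10, fuzzy_down_ta=10)
#     Terminator(args_term).run_terminator(args_term)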
class GoTermFinding(object): '''Retrieving GO terms''' def __init__(self, args_go): self.multiparser = Multiparser() self.helper = Helper() self.out_all = os.path.join(args_go.out_folder, "all_CDSs") self.out_express = os.path.join(args_go.out_folder, "expressed_CDSs") self.result_all_path = os.path.join(self.out_all, "GO_term_results") self.result_express_path = os.path.join(self.out_express, "GO_term_results") self.gff_path = os.path.join(args_go.gffs, "tmp") if args_go.trans is not None: self.tran_path = os.path.join(args_go.trans, "tmp") else: self.tran_path = None self.stat_all_path = os.path.join(self.out_all, "statistics") self.stat_express_path = os.path.join(self.out_express, "statistics") self.all_strain = "all_genomes_uniprot.csv" def _retrieve_go(self, uniprot, out_path, type_, log): prefixs = [] log.write("Running gene_ontology.py to retrieve GO terms.\n") for gff in os.listdir(self.gff_path): prefix = gff.replace(".gff", "") prefixs.append(prefix) self.helper.check_make_folder(os.path.join(out_path, prefix)) out_file = os.path.join(out_path, prefix, "_".join([prefix, "uniprot.csv"])) print("Extracting GO terms of {0} from UniProt".format(prefix)) if self.tran_path is not None: tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) else: tran_file = None retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff), out_file, tran_file, type_) log.write("\t" + out_file + " is generated.\n") def _remove_header(self, out_all): out = open(out_all + "_tmp", "w") fh = open(out_all, "r") out.write("\t".join(["Genome", "Strand", "Start", "End", "Protein_id", "Go_term"]) + "\n") for row in csv.reader(fh, delimiter='\t'): if row[0] != "Genome": out.write("\t".join(row) + "\n") out.close() fh.close() shutil.move(out_all + "_tmp", out_all) def _merge_files(self, gffs, out_path, out_folder, log): '''merge the files according to the input genome folder''' folders = [] log.write("Merging the output files based on the input genome " "information.\n") for folder in os.listdir(gffs): if folder.endswith("gff_folder"): folder_prefix = folder.replace(".gff_folder", "") folder_path = os.path.join(out_folder, folder_prefix) self.helper.check_make_folder(folder_path) folders.append(folder_path) filenames = [] for gff in os.listdir(os.path.join(gffs, folder)): if gff.endswith(".gff"): filenames.append(gff.replace(".gff", "")) out_all = os.path.join(folder_path, self.all_strain) if len(filenames) > 1: if self.all_strain in os.listdir(folder_path): os.remove(out_all) for filename in filenames: csv_file = "_".join([filename, "uniprot.csv"]) self.helper.merge_file(os.path.join(out_path, filename, csv_file), out_all) self._remove_header(out_all) shutil.copy(os.path.join(out_path, filename, csv_file), folder_path) else: shutil.copyfile(os.path.join(out_path, filenames[0], "_".join([filenames[0], "uniprot.csv"])), out_all) self.helper.remove_all_content(out_path, None, "dir") self.helper.remove_all_content(out_path, None, "file") for folder in folders: folder_prefix = folder.split("/")[-1] shutil.move(folder, os.path.join(out_path, folder_prefix)) for file_ in os.listdir(os.path.join(out_path, folder_prefix)): log.write("\t" + os.path.join(out_path, folder_prefix, file_) + " is generated.\n") def _stat(self, out_path, stat_path, go, goslim, out_folder, log): log.write("Running gene_ontology.py to retrieve GOslim terms and " "do statistics.\n") log.write("The following files are generated:\n") for folder in os.listdir(out_path): strain_stat_path = os.path.join(stat_path, folder) 
self.helper.check_make_folder(strain_stat_path) fig_path = os.path.join(strain_stat_path, "figs") if "figs" not in os.listdir(strain_stat_path): os.mkdir(fig_path) stat_file = os.path.join(strain_stat_path, "_".join(["stat", folder + ".csv"])) map2goslim(goslim, go, os.path.join(out_path, folder, self.all_strain), stat_file, out_folder) log.write("\t" + stat_file + "\n") self.helper.move_all_content(out_folder, fig_path, ["_three_roots.png"]) self.helper.move_all_content(out_folder, fig_path, ["_molecular_function.png"]) self.helper.move_all_content(out_folder, fig_path, ["_cellular_component.png"]) self.helper.move_all_content(out_folder, fig_path, ["_biological_process.png"]) for file_ in os.listdir(fig_path): log.write("\t" + os.path.join(fig_path, file_) + "\n") def run_go_term(self, args_go, log): for gff in os.listdir(args_go.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_go.gffs, gff)) self.multiparser.parser_gff(args_go.gffs, None) if args_go.trans is not None: self.multiparser.parser_gff(args_go.trans, "transcript") print("Computing all CDSs") log.write("Retrieving GO terms for all CDSs.\n") self._retrieve_go(args_go.uniprot, self.result_all_path, "all", log) self._merge_files(args_go.gffs, self.result_all_path, self.out_all, log) self._stat(self.result_all_path, self.stat_all_path, args_go.go, args_go.goslim, self.out_all, log) if args_go.trans is not None: log.write("Retrieving GO terms only for expressed CDSs.\n") print("Computing expressed CDSs") self._retrieve_go(args_go.uniprot, self.result_express_path, "express", log) self._merge_files(args_go.gffs, self.result_express_path, self.out_express, log) self._stat(self.result_express_path, self.stat_express_path, args_go.go, args_go.goslim, self.out_express, log) self.helper.remove_tmp_dir(args_go.gffs) if args_go.trans is not None: self.helper.remove_tmp_dir(args_go.trans)
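# Usage sketch (database file names are hypothetical placeholders): the GO
# pipeline maps CDS features against a local UniProt idmapping table and
# then condenses the hits to GOslim categories for the statistics plots.
# trans is optional; when given, a second pass restricts the analysis to
# expressed CDSs only.
#
#     from types import SimpleNamespace
#     args_go = SimpleNamespace(
#         gffs="input/gffs", trans="output/transcripts/gffs",
#         out_folder="output/GO_terms",
#         uniprot="input/database/idmapping_selected.tab",
#         go="input/database/go.obo",
#         goslim="input/database/goslim_generic.obo")
#     with open("go_term.log", "w") as log:
#         GoTermFinding(args_go).run_go_term(args_go, log)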
class Ribos(object): def __init__(self, args_ribo): self.multiparser = Multiparser() self.helper = Helper() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_ribo.gffs, "tmp") self.tss_path = os.path.join(args_ribo.tsss, "tmp") self.tran_path = os.path.join(args_ribo.trans, "tmp") self.fasta_path = os.path.join(args_ribo.fastas, "tmp") self.stat_folder = os.path.join(args_ribo.out_folder, "statistics") self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs") self.table_folder = os.path.join(args_ribo.out_folder, "tables") self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam") self.ribos_rfam = os.path.join(args_ribo.database, "Rfam_riboswitch.cm") self.tmp_files = { "fasta": os.path.join(args_ribo.out_folder, "tmp_fasta"), "scan": os.path.join(args_ribo.out_folder, "tmp_scan"), "table": os.path.join(args_ribo.out_folder, "tmp_table") } self.suffixs = { "csv": "riboswitch.csv", "txt": "riboswitch_prescan.txt", "re_txt": "riboswitch_scan.txt", "re_csv": "riboswitch_scan.csv" } def _run_infernal(self, args_ribo, seq, type_, prefix): scan_file = os.path.join(self.tmp_files["scan"], "_".join([prefix, self.suffixs[type_]])) scan = open(scan_file, "w") call([ os.path.join(args_ribo.infernal_path, "cmscan"), "--incE", str(args_ribo.e_value), "--acc", self.ribos_rfam, seq ], stdout=scan) scan.close() return scan_file def _scan_extract_rfam(self, prefixs, args_ribo): for gff in os.listdir(self.gff_path): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") first_seq = os.path.join(self.tmp_files["fasta"], prefix + ".fa") prefixs.append(prefix) print("Extracting sequences of riboswitch candidates of {0}".format( prefix)) extract_potential_rbs( os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.gff_path, gff), os.path.join(self.tss_path, prefix + "_TSS.gff"), os.path.join(self.tran_path, prefix + "_transcript.gff"), first_seq, args_ribo) print("Pre-scanning of {0}".format(prefix)) first_scan_file = self._run_infernal(args_ribo, first_seq, "txt", prefix) sec_seq = os.path.join(self.tmp_files["fasta"], "_".join([prefix, "regenerate.fa"])) first_table = os.path.join( self.tmp_files["table"], "_".join([prefix, self.suffixs["csv"]])) regenerate_seq(first_scan_file, first_seq, first_table, sec_seq) print("Scanning of {0}".format(prefix)) sec_scan_file = self._run_infernal(args_ribo, sec_seq, "re_txt", prefix) sec_table = os.path.join( self.tmp_files["table"], "_".join([prefix, self.suffixs["re_csv"]])) reextract_rbs(sec_scan_file, first_table, sec_table) shutil.move(sec_table, first_table) modify_table(first_table, args_ribo.output_all) return prefixs def _merge_results(self, args_ribo): for gff in os.listdir(args_ribo.gffs): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") print("Merging results of {0}".format(prefix)) pre_strain = "" self.helper.check_make_folder( os.path.join(self.scan_folder, prefix)) fh = open(os.path.join(args_ribo.gffs, gff)) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: if len(pre_strain) == 0: shutil.copyfile( os.path.join( self.tmp_files["table"], "_".join( [entry.seq_id, self.suffixs["csv"]])), os.path.join( self.table_folder, "_".join([prefix, self.suffixs["csv"]]))) else: self.helper.merge_file( os.path.join( self.tmp_files["table"], "_".join( [entry.seq_id, self.suffixs["csv"]])), os.path.join( self.table_folder, "_".join([prefix, self.suffixs["csv"]]))) shutil.copy( os.path.join( self.tmp_files["scan"], "_".join([entry.seq_id, self.suffixs["txt"]])), os.path.join(self.scan_folder, prefix)) 
shutil.copy( os.path.join( self.tmp_files["scan"], "_".join( [entry.seq_id, self.suffixs["re_txt"]])), os.path.join(self.scan_folder, prefix)) pre_strain = entry.seq_id out_stat = os.path.join( self.stat_folder, "_".join(["stat", prefix, "riboswitch.txt"])) print("Computing statistics of {0}".format(prefix)) stat_and_covert2gff( os.path.join(self.table_folder, "_".join([prefix, self.suffixs["csv"]])), args_ribo.ribos_id, os.path.join(self.gff_outfolder, "_".join([prefix, "riboswitch.gff"])), args_ribo.fuzzy, out_stat) fh.close() def _remove_tmp(self, args_ribo): self.helper.remove_tmp(args_ribo.gffs) self.helper.remove_tmp(args_ribo.fastas) self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir") def _remove_overlap(self, gff_path): for gff in os.listdir(gff_path): if gff.endswith(".gff"): rbs_overlap( os.path.join( self.tmp_files["table"], "_".join( [gff.replace(".gff", ""), self.suffixs["csv"]])), os.path.join(gff_path, gff)) def run_ribos(self, args_ribo): if args_ribo.fuzzy_rbs > 6: print("Error: --fuzzy_rbs should be equal to or less than 6!") sys.exit() self.multiparser.parser_gff(args_ribo.gffs, None) self.multiparser.parser_fasta(args_ribo.fastas) self.multiparser.parser_gff(args_ribo.trans, "transcript") self.multiparser.parser_gff(args_ribo.tsss, "TSS") for gff in os.listdir(args_ribo.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes( os.path.join(args_ribo.gffs, gff)) rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam) print("Compressing Rfam...") call([ os.path.join(args_ribo.infernal_path, "cmpress"), "-F", self.ribos_rfam ]) prefixs = [] self.helper.check_make_folder(self.tmp_files["fasta"]) self.helper.check_make_folder(self.tmp_files["scan"]) self.helper.check_make_folder(self.tmp_files["table"]) prefixs = self._scan_extract_rfam(prefixs, args_ribo) self._remove_overlap(self.gff_path) self._merge_results(args_ribo) mapping_ribos(self.table_folder, args_ribo.ribos_id) self._remove_tmp(args_ribo)
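# Usage sketch (all values are assumptions): run_ribos first builds a
# riboswitch-only covariance model file from Rfam (rbs_from_rfam) and
# compresses it with cmpress, then runs two cmscan passes over the
# extracted candidate sequences. infernal_path is the directory holding
# the cmscan and cmpress binaries.
#
#     from types import SimpleNamespace
#     args_ribo = SimpleNamespace(
#         gffs="input/gffs", fastas="input/fastas",
#         tsss="output/TSSs/gffs", trans="output/transcripts/gffs",
#         out_folder="output/riboswitches", database="input/database",
#         rfam="input/database/Rfam.cm",
#         ribos_id="input/database/Rfam_riboswitch_ID.csv",
#         infernal_path="/usr/local/bin", e_value=0.001,
#         fuzzy=10, fuzzy_rbs=5, output_all=False)
#     Ribos(args_ribo).run_ribos(args_ribo)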
class CircRNADetection(object):
    '''Detection of circRNA'''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''Wait for the parallel alignment processes to finish.'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        tmp_reads = []
        for reads in read_files:
            zips = []
            # Copy the list so that appending the uncompressed files below
            # does not mutate the list that is being iterated.
            tmp_datas = list(reads["files"])
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        log.write(" ".join([segemehl_path,
                            "-x", os.path.join(fasta_path, index),
                            "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        out = open(os.path.join(self.alignment_path,
                                fasta_prefix, sam_file), "w")
        # Keep segemehl's stderr in a per-alignment log file; the command
        # line itself is recorded in the main log.
        align_log = open(os.path.join(self.alignment_path,
                                      fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                            "-i", os.path.join(self.fasta_path, index),
                            "-d", os.path.join(self.fasta_path, fasta),
                            "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=align_log)
        return p
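    # For reference, the two segemehl invocations assembled above
    # correspond to command lines of the following shape (file names are
    # illustrative placeholders):
    #
    #   segemehl.x -x genome.idx -d genome.fa
    #   segemehl.x -i genome.idx -d genome.fa -q reads.fq -S > out.sam
    #
    # "-x" builds the index, "-i"/"-d" supply index and genome for the
    # alignment run, and "-S" enables split-read alignment, which
    # testrealign.x later uses to call circRNA junctions.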
    def _align(self, args_circ, read_datas, log):
        '''Align the reads. If BAM files are provided,
        this step can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the reads.\n")
        log.write("Please make sure the version of segemehl is at least "
                  "0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index,
                                           fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if (read_name.endswith(".fa") or
                            read_name.endswith(".fna") or
                            read_name.endswith(".fasta") or
                            read_name.endswith(".fq") or
                            read_name.endswith(".fastq")):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join([read_prefix,
                                             fasta_prefix + ".sam"])
                        log_file = "_".join([read_prefix,
                                             fasta_prefix + ".log"])
                        align_files.append("_".join([read_prefix,
                                                     fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
            self._wait_process(processes)
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            os.path.join(self.alignment_path, fasta_prefix)))
        for file_ in os.listdir(os.path.join(
                self.alignment_path, fasta_prefix)):
            log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        log.write(" ".join([samtools_path, "view", "-bS",
                            pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of Samtools is at least "
                  "1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(
                    sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones
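    # The merge/sort/convert step below boils down to this Samtools
    # pipeline per sample (file names are illustrative placeholders):
    #
    #   samtools merge <prefix>_<sample>.bam <in1>.bam <in2>.bam ...
    #   samtools sort -o <prefix>_<sample>_sort.bam <prefix>_<sample>.bam
    #   samtools view -h -o <prefix>_<sample>_sort.sam <..._sort.bam>
    #
    # testrealign.x consumes the sorted SAM file, hence the final
    # BAM-to-SAM conversion.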
    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version of Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(
                out_folder,
                "_".join([prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort", "-o",
                                sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                                sort_sample.replace(".bam", ".sam"),
                                sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path, out_folder,
            convert_ones, tmp_reads, remove_ones, prefix, log):
        if bam_datas is None:
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if bam not in bam_data["files"]:
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl is at least "
                  "0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does "
                  "not exist, please reinstall Segemehl via 'make all'.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix +
                                       "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the BED files for analysis.'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return samples, fa_prefixs
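    # After merging, each genome/sample pair is represented by one pair of
    # BED files named <fasta_prefix><sample>splicesites.bed and
    # <fasta_prefix><sample>transrealigned.bed (the <sample> fragment keeps
    # its surrounding separators from the original file name). These are
    # the inputs for the statistics and GFF generation below.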
    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''Do statistics and write the results to GFF files.'''
        log.write("Running circRNA.py to do statistics and generate gff "
                  "files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(
                os.path.join(self.gff_folder, prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            for bed in os.listdir(os.path.join(
                    args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and bed.endswith(".bed"):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(csv_all, args_circ,
                                                gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''Detection of circRNAs.'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files cannot both be "
                      "empty.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exist.\n")
            print("Error: please assign the segemehl path!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    fasta_prefix = fasta.replace(".fa", "")
                    prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(
                    self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = (
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files, log))
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder, convert_ones, tmp_reads,
                remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
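# A minimal driver sketch for the circRNA pipeline above. As with the
# riboswitch class, the argument object normally comes from the CLI; the
# SimpleNamespace below is a hypothetical stand-in that only lists the
# attributes referenced directly in this class (detect_circrna() and
# convert_circ2gff() may read further fields not shown here).
#
# from types import SimpleNamespace
#
# args_circ = SimpleNamespace(
#     output_folder="output/circRNA", gffs="input/gffs",
#     fastas="input/fastas", stat_folder="output/circRNA/statistics",
#     segemehl_path="/usr/local/bin/segemehl.x",
#     samtools_path="/usr/local/bin/samtools",
#     testrealign_path="/usr/local/bin/testrealign.x",
#     bams=None,                              # e.g. ["sample1:a.bam,b.bam"]
#     read_files=["sample1:reads_1.fq,reads_2.fq"],
#     cores=4)
# with open("output/circRNA/log.txt", "w") as log:
#     CircRNADetection(args_circ).run_circrna(args_circ, log)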